From 7485d93aa5df3c22c3881f94d7dfab47ab4027cc Mon Sep 17 00:00:00 2001 From: zPlus Date: Thu, 23 Nov 2023 08:33:28 +0100 Subject: [PATCH] Replace urllib.parse.quote() with percent_encode(). --- scripts/rdf.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/rdf.py b/scripts/rdf.py index 9048145..8c774e2 100755 --- a/scripts/rdf.py +++ b/scripts/rdf.py @@ -4,7 +4,6 @@ import os import pathlib import rdflib import sys -import urllib.parse from rdflib import BNode, Graph, Literal, Namespace, URIRef from rdflib.namespace import RDF @@ -20,6 +19,18 @@ MANPAGE = Namespace('dokk:manpages:') # A graph to store all the triples g = Graph() +def percent_encode(string): + """ + A few manpages contain special characters that are not valid symbols in a URL's + path. Since we use node URIs like <dokk:manpages:debian/{distro_number}/{distro_package}/{filename}>, we need + to percent-encode these otherwise the URLs are invalid. + The reason for using a custom method instead of urllib.parse.quote() is that quote() + will percent-encode *any* non ASCII character such as non-latin characters. Only ' ' (rewritten to '_') and '#' (rewritten to '%23') are touched; every other character is returned unchanged. + """ + + return string.replace(' ', '_') \ + .replace('#', '%23') + for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): if not absolute_file_path.is_file(): exit('Not a file: {}'.format(absolute_file_path)) @@ -64,7 +75,7 @@ for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): # because I haven't got enough RAM for storing thousands of pages.
g_page = Graph() - page_ref = URIRef('dokk:manpages:debian/' + urllib.parse.quote(f'{distro_number}/{distro_package}/{filename}')) + page_ref = URIRef('dokk:manpages:debian/' + percent_encode(f'{distro_number}/{distro_package}/{filename}')) g_page.add((page_ref, RDF.type, URIRef(MANPAGE.Page))) g_page.add((page_ref, URIRef(MANPAGE.filename), Literal(filename))) g_page.add((page_ref, URIRef(MANPAGE.name), Literal(name))) @@ -82,14 +93,14 @@ for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): # Create a graph node for this package # Link to the page node - package_ref = URIRef('dokk:manpages:debian/' + urllib.parse.quote(f'{distro_number}/{distro_package}')) + package_ref = URIRef('dokk:manpages:debian/' + percent_encode(f'{distro_number}/{distro_package}')) g.add((package_ref, RDF.type, URIRef(MANPAGE.Package))) g.add((package_ref, MANPAGE.name, Literal(distro_package))) g.add((package_ref, MANPAGE.page, page_ref)) # Create a graph node for this distro # Link to the package node - distro_ref = URIRef('dokk:manpages:debian/' + urllib.parse.quote(f'{distro_number}')) + distro_ref = URIRef('dokk:manpages:debian/' + percent_encode(f'{distro_number}')) g.add((distro_ref, RDF.type, URIRef(MANPAGE.Distribution))) g.add((distro_ref, MANPAGE.name, Literal('debian'))) g.add((distro_ref, MANPAGE.codename, Literal(distro_codename)))