#!/usr/bin/env python3
import os
import pathlib
import rdflib
import sys
from rdflib import BNode, Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF
# Configuration is taken from the environment:
#   CODENAME            - Debian release codename; its lowercase form is the
#                         sub-directory of DEBIMAN_SERVING_DIR that is scanned.
#   DEBIMAN_SERVING_DIR - root directory containing the rendered manpages.
CODENAME = os.getenv('CODENAME')
DEBIMAN_SERVING_DIR = os.getenv('DEBIMAN_SERVING_DIR')

# Abort early when the configuration is incomplete. sys.exit() with a string
# writes the message to stderr and terminates with exit status 1, so the
# failure is visible to callers and never ends up in the N-Triples on stdout.
if not CODENAME:
    sys.exit('envvar CODENAME is not defined')

if not DEBIMAN_SERVING_DIR:
    sys.exit('envvar DEBIMAN_SERVING_DIR is not defined')

# Namespace from which all predicates and classes of the graph are derived.
MANPAGE = Namespace('dokk:manpages:')

# Graph collecting the package and distribution triples. The triples of each
# individual page are kept in a separate, short-lived graph and printed at once.
g = Graph()
def percent_encode(string):
    """
    Escape the characters of a manpage name that must not appear literally in
    the path of a node URI such as <dokk:manpages:distro/package/name>.

    Only the space, '#', '[' and ']' characters are rewritten to their %XX
    form; every other character, including non-ASCII ones, is returned
    unchanged. urllib.parse.quote() is deliberately not used: it targets URLs
    rather than IRIs and would therefore %-encode *any* non-ASCII character.
    """
    escapes = str.maketrans({
        ' ': '%20',
        '#': '%23',
        '[': '%5B',
        ']': '%5D',
    })
    return string.translate(escapes)
# Debian release codenames mapped to the version number used in node URIs.
_DEBIAN_RELEASES = {'buster': 10, 'bullseye': 11, 'bookworm': 12}


def _read_text_or_empty(path):
    """Return the text content of *path*, or '' when it cannot be read.

    Reading is best-effort: a missing, unreadable or undecodable file yields an
    empty string instead of aborting the whole run. Only I/O and decoding
    errors are swallowed, so KeyboardInterrupt and SystemExit still propagate.
    """
    # NOTE(review): the file is decoded with the locale's default encoding -
    # confirm whether UTF-8 should be forced here.
    try:
        with open(path, 'r') as f:
            return f.read()
    except (OSError, UnicodeDecodeError):
        return ''


# Walk every .roff file below <DEBIMAN_SERVING_DIR>/<codename> and emit one
# small graph per manpage.
for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR, CODENAME.lower()).glob('**/*.roff'):
    if not absolute_file_path.is_file():
        sys.exit('Not a file: {}'.format(absolute_file_path))

    # Remove the DEBIMAN_SERVING_DIR prefix from the path
    # Also remove .roff suffix from the filename
    file_path = absolute_file_path.relative_to(DEBIMAN_SERVING_DIR).with_suffix('')

    # The first three path components are <codename>/<package>/<filename>.
    distro_codename, distro_package, filename = file_path.parts[:3]

    # Replace debian codenames with version numbers
    distro_number = _DEBIAN_RELEASES.get(distro_codename)
    if distro_number is None:
        sys.exit('Distro codename not recognized.')

    # The filename is expected to look like <name>.<section>.<language>; the
    # name itself may contain dots, hence the split from the right.
    name, section, language = filename.rsplit('.', 2)

    # Split the section into its leading digit and the remaining suffix,
    # e.g. '3ssl' -> (3, 'ssl'). int() raises ValueError if the section does
    # not start with a digit.
    section_number, subsection = int(section[:1]), section[1:]

    # Read files. The renderings are looked up next to the roff source as
    # <file>.roff.txt and <file>.roff.html.
    roff = _read_text_or_empty(absolute_file_path)
    plaintext = _read_text_or_empty(f'{absolute_file_path}.txt')
    html = _read_text_or_empty(f'{absolute_file_path}.html')

    # Create a new graph for this manpage that is printed out immediately
    # because I haven't got enough RAM for storing the whole graph.
    g_page = Graph()

    page_ref = URIRef(percent_encode(f'dokk:manpages:debian/{distro_number}/{distro_package}/{filename}'))

    # Attributes of a Namespace are already URIRef terms, so they are used
    # directly as predicates and classes.
    g_page.add((page_ref, RDF.type, MANPAGE.Page))
    g_page.add((page_ref, MANPAGE.filename, Literal(filename)))
    g_page.add((page_ref, MANPAGE.name, Literal(name)))
    g_page.add((page_ref, MANPAGE.name_lowercase, Literal(name.lower())))
    g_page.add((page_ref, MANPAGE.section, Literal(section)))
    g_page.add((page_ref, MANPAGE.section_lowercase, Literal(section.lower())))
    g_page.add((page_ref, MANPAGE.section_number, Literal(section_number)))
    g_page.add((page_ref, MANPAGE.subsection, Literal(subsection)))
    g_page.add((page_ref, MANPAGE.language, Literal(language)))
    g_page.add((page_ref, MANPAGE.roff, Literal(roff)))
    g_page.add((page_ref, MANPAGE.plaintext, Literal(plaintext)))
    g_page.add((page_ref, MANPAGE.html, Literal(html)))

    print(g_page.serialize(format='nt'))

    # Now we're going to create nodes for the debian distro as well as the package
    # this page belongs to.

    # Create a graph node for this package
    # Link to the page node
    package_ref = URIRef(percent_encode(f'dokk:manpages:debian/{distro_number}/{distro_package}'))
    g.add((package_ref, RDF.type, MANPAGE.Package))
    g.add((package_ref, MANPAGE.name, Literal(distro_package)))
    g.add((package_ref, MANPAGE.page, page_ref))

    # Create a graph node for this distro
    # Link to the package node
    distro_ref = URIRef(percent_encode(f'dokk:manpages:debian/{distro_number}'))
    g.add((distro_ref, RDF.type, MANPAGE.Distribution))
    g.add((distro_ref, MANPAGE.name, Literal('debian')))
    g.add((distro_ref, MANPAGE.codename, Literal(distro_codename)))
    g.add((distro_ref, MANPAGE.number, Literal(distro_number)))
    g.add((distro_ref, MANPAGE.package, package_ref))

# Print out the graph holding the package and distribution nodes
print(g.serialize(format='nt'))