home » dokk/manpages.git
ID: 9048145c62e9a08e2e8250f6884f8571b3f38989
100 lines — 4K — View raw


#!/usr/bin/env python3

import os
import pathlib
import rdflib
import sys
import urllib.parse

from rdflib import BNode, Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF

# Root directory that is scanned recursively for *.roff files; it must be
# supplied through the environment.
DEBIMAN_SERVING_DIR = os.getenv('DEBIMAN_SERVING_DIR')

if not DEBIMAN_SERVING_DIR:
    # stdout carries the N-Triples output of this script, so the diagnostic
    # must not be printed there. sys.exit() with a string writes it to stderr
    # and terminates with a non-zero status, letting callers detect the failure.
    sys.exit('envvar DEBIMAN_SERVING_DIR is not defined')

# Namespace from which every class and predicate URI of this script is built
MANPAGE = Namespace('dokk:manpages:')

# Graph that accumulates the package and distribution triples; the page
# triples are kept in short-lived per-page graphs instead.
g = Graph()

# Debian release codename -> version number used in the generated URIs.
DEBIAN_RELEASE_NUMBERS = {
    'buster': 10,
    'bullseye': 11,
    'bookworm': 12,
}


def _read_text_or_empty(path):
    """Return the text content of *path*, or '' when it cannot be read.

    Reading is best-effort: a missing, unreadable or undecodable file yields
    an empty string. Only I/O and decoding errors are absorbed, so Ctrl-C and
    programming errors still propagate instead of being silently swallowed.
    """
    try:
        with open(path, 'r') as f:
            return f.read()
    except (OSError, UnicodeError):
        return ''


def _split_manpage_filename(filename):
    """Split ``<name>.<section>.<language>`` into its components.

    The split is done from the right, so *name* may itself contain dots.

    Returns:
        A tuple ``(name, section, section_number, subsection, language)``
        where ``section_number`` is the first character of ``section`` as an
        int and ``subsection`` is the remainder of ``section``.

    Raises:
        ValueError: if *filename* has fewer than three dot-separated parts or
            the section does not start with a digit.
    """
    pieces = filename.rsplit('.', 2)
    if len(pieces) != 3:
        raise ValueError(
            'expected <name>.<section>.<language>, got {!r}'.format(filename))
    name, section, language = pieces

    try:
        section_number = int(section[:1])
    except ValueError:
        raise ValueError(
            'section {!r} does not start with a digit'.format(section)) from None

    return name, section, section_number, section[1:], language


for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'):
    if not absolute_file_path.is_file():
        sys.exit('Not a file: {}'.format(absolute_file_path))

    # Remove the DEBIMAN_SERVING_DIR prefix from the path
    # Also remove .roff suffix from the filename
    file_path = absolute_file_path.relative_to(DEBIMAN_SERVING_DIR).with_suffix('')
    file_parts = file_path.parts

    # The first three components are read as <codename>/<package>/<filename>;
    # fail with a readable message instead of an IndexError traceback.
    if len(file_parts) < 3:
        sys.exit('Unexpected path layout (want <codename>/<package>/<file>): {}'
                 .format(absolute_file_path))

    distro_codename, distro_package, filename = file_parts[:3]

    # Replace debian codenames with version numbers
    distro_number = DEBIAN_RELEASE_NUMBERS.get(distro_codename)
    if distro_number is None:
        sys.exit('Distro codename not recognized.')

    try:
        name, section, section_number, subsection, language = \
            _split_manpage_filename(filename)
    except ValueError as err:
        sys.exit('{}: {}'.format(absolute_file_path, err))

    # Read the roff source and its renderings, which sit next to it as
    # "<file>.roff.txt" and "<file>.roff.html". Any of them may be absent.
    roff = _read_text_or_empty(absolute_file_path)
    plaintext = _read_text_or_empty(f'{absolute_file_path}.txt')
    html = _read_text_or_empty(f'{absolute_file_path}.html')

    # Each page gets its own throwaway graph that is printed out immediately:
    # keeping the full text of thousands of pages in one graph needs too
    # much RAM.
    g_page = Graph()

    page_ref = URIRef('dokk:manpages:debian/' + urllib.parse.quote(f'{distro_number}/{distro_package}/{filename}'))
    g_page.add((page_ref, RDF.type, MANPAGE.Page))

    # (predicate term, literal value) pairs describing the page
    page_literals = (
        ('filename',          filename),
        ('name',              name),
        ('name_lowercase',    name.lower()),
        ('section',           section),
        ('section_lowercase', section.lower()),
        ('section_number',    section_number),
        ('subsection',        subsection),
        ('language',          language),
        ('roff',              roff),
        ('plaintext',         plaintext),
        ('html',              html),
    )
    for term, value in page_literals:
        g_page.add((page_ref, MANPAGE[term], Literal(value)))

    print(g_page.serialize(format='nt'))

    # Create a graph node for this package
    # Link to the page node
    package_ref = URIRef('dokk:manpages:debian/' + urllib.parse.quote(f'{distro_number}/{distro_package}'))
    g.add((package_ref, RDF.type,       MANPAGE.Package))
    g.add((package_ref, MANPAGE.name,   Literal(distro_package)))
    g.add((package_ref, MANPAGE.page,   page_ref))

    # Create a graph node for this distro
    # Link to the package node
    # (re-adding the same distro triples for every page is harmless: a graph
    # is a set of triples)
    distro_ref = URIRef('dokk:manpages:debian/' + urllib.parse.quote(f'{distro_number}'))
    g.add((distro_ref, RDF.type,            MANPAGE.Distribution))
    g.add((distro_ref, MANPAGE.name,        Literal('debian')))
    g.add((distro_ref, MANPAGE.codename,    Literal(distro_codename)))
    g.add((distro_ref, MANPAGE.number,      Literal(distro_number)))
    g.add((distro_ref, MANPAGE.package,     package_ref))

# Emit the accumulated package and distribution triples as N-Triples
ntriples = g.serialize(format='nt')
print(ntriples)