From 66e080219d358276251db1783baa98932fedae5b Mon Sep 17 00:00:00 2001
From: zPlus
Date: Sat, 8 Jul 2023 07:04:33 +0200
Subject: [PATCH] Initial commit

---
 README           |  22 +++++++++
 scripts/Makefile |  35 ++++++++++++++
 scripts/rdf.py   | 118 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 175 insertions(+)
 create mode 100644 README
 create mode 100755 scripts/Makefile
 create mode 100755 scripts/rdf.py

diff --git a/README b/README
new file mode 100644
index 0000000..f52f445
--- /dev/null
+++ b/README
@@ -0,0 +1,22 @@
+Software prerequisites: apt-cacher-ng debiman mandoc python3
+
+After installing apt-cacher-ng, the proxy is started automatically and should be
+listening on port 3142.
+debiman by default will download from http://localhost:3142/deb.debian.org/
+This setting can be changed with -local_mirror or -remote_mirror
+
+Download manpages from Debian:
+
+    $ DEBIMAN_SERVING_DIR=/absolute/path make download
+
+Extract all the pages from .gz:
+
+    $ DEBIMAN_SERVING_DIR=/absolute/path make extract
+
+Convert manpages to plaintext:
+
+    $ DEBIMAN_SERVING_DIR=/absolute/path make convert
+
+Create RDF graph:
+
+    $ DEBIMAN_SERVING_DIR=/absolute/path ./rdf.py
diff --git a/scripts/Makefile b/scripts/Makefile
new file mode 100755
index 0000000..12f02b2
--- /dev/null
+++ b/scripts/Makefile
@@ -0,0 +1,35 @@
+SHELL = /bin/bash
+DEBIMAN_SERVING_DIR ?= ./
+
+
+# This just prints out variables for displaying
+vars:
+	@echo "Using envvar DEBIMAN_SERVING_DIR = ${DEBIMAN_SERVING_DIR}"
+
+
+# Note: the behaviour of debiman is to download all the manpages, then render them all.
+# This cannot be changed. Since we're only interested in the raw manpages and not
+# the HTML output, -only_render_pkgs is a hack that will make debiman render only one
+# page (0ad) and quit.
+download: vars
+	debiman -sync_codenames="bookworm" -serving_dir="${DEBIMAN_SERVING_DIR}" -only_render_pkgs="0ad"
+
+# Extract downloaded pages since they're compressed by default
+extract: vars
+	find "${DEBIMAN_SERVING_DIR}" -type f,l -name "*.gz" -exec gunzip --decompress --force --keep "{}" \;
+
+# Convert manpages from roff to plaintext
+# Manpage files are named "page.section.lang".
+# TODO ! -name "stress-ng.1.en" ! -name "md.4.en"
+# this is a hack for skipping the rendering of those pages. The version of mandoc
+# in Debian is outdated and gets stuck in an infinite loop. Remove this hack if
+# using a more recent mandoc.
+convert: vars
+	while IFS= read -r file; do \
+	    echo "$${file}"; \
+	    cp "$${file}" "$${file}.roff"; \
+	    mandoc -T utf8 "$${file}" > "$${file}.roff.txt"; \
+	    mandoc -T html -O fragment "$${file}" > "$${file}.roff.html"; \
+	done < <( find "${DEBIMAN_SERVING_DIR}" -type f -name "*.*.*" ! -name "*.gz" ! -name "*.roff" ! -name "*.txt" ! -name "*.html" ! -name "stress-ng.1.en" ! -name "md.4.en" )
+
+.PHONY: vars download extract convert
diff --git a/scripts/rdf.py b/scripts/rdf.py
new file mode 100755
index 0000000..8763df6
--- /dev/null
+++ b/scripts/rdf.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""Write an N-Triples description of every converted manpage to stdout.
+
+The script walks DEBIMAN_SERVING_DIR for the "*.roff" copies created by
+"make convert" and emits, for each page, its name, section, language and the
+roff/plaintext/HTML renderings, plus the package origin of Debian pages.
+"""
+
+import os
+import pathlib
+import rdflib
+import sys
+
+from rdflib import BNode, Graph, Literal, Namespace, URIRef
+from rdflib.namespace import RDF
+
+MANPAGE = Namespace('dokk:manpages:')
+
+
+def warn(message):
+    """Report a problem on stderr so that stdout only carries triples."""
+    print(message, file=sys.stderr)
+
+
+def read_text(path):
+    """Return the UTF-8 text of path, or '' if it cannot be read."""
+    try:
+        with open(path, 'r', encoding='utf-8') as f:
+            return f.read()
+    except (OSError, UnicodeDecodeError):
+        # Renderings are best-effort: the page is emitted even without them
+        return ''
+
+
+def main():
+    """Convert all pages; return the process exit status."""
+    serving_dir = os.getenv('DEBIMAN_SERVING_DIR')
+
+    if not serving_dir:
+        warn('envvar DEBIMAN_SERVING_DIR is not defined')
+        return 1
+
+    for absolute_file_path in sorted(pathlib.Path(serving_dir).glob('**/*.roff')):
+        if not absolute_file_path.is_file():
+            warn('Not a file: {}'.format(absolute_file_path))
+            continue
+
+        # Remove the DEBIMAN_SERVING_DIR prefix from the path
+        # Also remove .roff suffix from the filename
+        file = absolute_file_path.relative_to(serving_dir).with_suffix('')
+        file_parts = list(file.parts)
+
+        # Debian pages are expected as debian/<codename>/<package>/<page>
+        is_debian = file_parts[0] == 'debian' and len(file_parts) >= 4
+
+        # Replace debian codenames with version numbers
+        if is_debian:
+            file_parts[1] = file_parts[1].replace('bookworm', '12')
+            file = pathlib.Path(*file_parts)
+
+        # Manpage files are named "page.section.lang"; the section must start
+        # with its number. Skip anything else instead of aborting the run.
+        try:
+            name, section, language = file.name.rsplit('.', 2)
+            section_number, subsection = int(section[:1]), section[1:]
+        except ValueError:
+            warn('Unexpected manpage filename: {}'.format(absolute_file_path))
+            continue
+
+        # Fix characters that cannot be used as valid URIs
+        page_id = str(file).replace(' ', '_') \
+                           .replace('[', '%5B') \
+                           .replace(']', '%5D') \
+                           .replace('#', '%23')
+
+        # Read files
+        roff = read_text(absolute_file_path)
+        plaintext = read_text(f'{absolute_file_path}.txt')
+        html = read_text(f'{absolute_file_path}.html')
+
+        turtle = f"""
+            {URIRef(MANPAGE[page_id]).n3()}
+                {URIRef(MANPAGE.name).n3()} {Literal(name).n3()} ;
+                {URIRef(MANPAGE.name_lowercase).n3()} {Literal(name.lower()).n3()} ;
+                {URIRef(MANPAGE.section).n3()} {Literal(section).n3()} ;
+                {URIRef(MANPAGE.section_lowercase).n3()} {Literal(section.lower()).n3()} ;
+                {URIRef(MANPAGE.section_number).n3()} {Literal(section_number).n3()} ;
+                {URIRef(MANPAGE.subsection).n3()} {Literal(subsection).n3()} ;
+                {URIRef(MANPAGE.language).n3()} {Literal(language).n3()} ;
+                {URIRef(MANPAGE.roff).n3()} {Literal(roff).n3()} ;
+                {URIRef(MANPAGE.plaintext).n3()} {Literal(plaintext).n3()} ;
+                {URIRef(MANPAGE.html).n3()} {Literal(html).n3()} ;
+        """
+
+        if is_debian:
+            turtle += f"""
+                {URIRef(MANPAGE.source).n3()} [
+                    {URIRef(MANPAGE.distribution_name).n3()} {Literal(file_parts[0]).n3()} ;
+                    {URIRef(MANPAGE.distribution_version).n3()} {Literal(file_parts[1]).n3()} ;
+                    {URIRef(MANPAGE.package).n3()} {Literal(file_parts[2]).n3()} ;
+                    {URIRef(MANPAGE.filename).n3()} {Literal(file.name).n3()} ;
+                ]
+            """
+
+        # Always close the statement: without the final "." the parser rejects
+        # pages that have no Debian source block.
+        turtle += ' .'
+
+        mangraph = Graph().parse(publicID='', format='turtle', data=turtle)
+
+        triples = mangraph.serialize(destination=None, format='nt')
+        sys.stdout.write(triples)
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())