diff --git a/README b/README index b9e9121..9bc1022 100644 --- a/README +++ b/README @@ -1,24 +1,33 @@ -Software prerequisites: apt-cacher-ng debiman mandoc python3 +Software prerequisites: + - apt-cacher-ng + - debiman (install from repository, follow instructions in README) + - mandoc + - python3 + - python3-venv After installing apt-cacher-ng, the proxy is started automatically and should be listening on port 3142. debiman by default will download from http://localhost:3142/deb.debian.org/ -This setting can be changed with -local_mirror or -remote_mirror +This option can be changed with -local_mirror or -remote_mirror. Non-LTS Debian releases are +removed from the mirrors; use the archive instead: https://archive.debian.org How to use the scripts: 1. Download manpages from Debian: - $ DEBIMAN_SERVING_DIR=/absolue/path make download + $ MIRROR="https://archive.debian.org" CODENAME="buster" DEBIMAN_SERVING_DIR=/path make download 2. Extract all the pages from .gz: - $ DEBIMAN_SERVING_DIR=/absolue/path make extract + $ DEBIMAN_SERVING_DIR=/path make extract 3. Convert manpages to plaintext and html: - $ DEBIMAN_SERVING_DIR=/absolue/path make convert + $ DEBIMAN_SERVING_DIR=/path make convert 4. Create RDF graph: - $ DEBIMAN_SERVING_DIR=/absolue/path ./rdf.py + $ python3 -m venv venv + $ source venv/bin/activate + $ pip install rdflib + $ CODENAME="buster" DEBIMAN_SERVING_DIR=/path ./rdf.py diff --git a/scripts/Makefile b/scripts/Makefile index 3d83416..cc4485f 100755 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -1,31 +1,35 @@ SHELL = /bin/bash -# THIS PATH MUST BE ABSOLUTE. +# The mirror to download from +MIRROR ?= + # The path where debiman will save files. 
-# There is a bug in debiman, it won't accept a relative path -DEBIMAN_SERVING_DIR ?= ./ +DEBIMAN_SERVING_DIR ?= +# Name of the distribution to be processed +CODENAME ?= # This just prints out variables for displaying vars: - @echo "Using envvar DEBIMAN_SERVING_DIR = ${DEBIMAN_SERVING_DIR}" - @echo "If you get errors, this path MUST be absolute." + @echo "MIRROR=${MIRROR}" + @echo "CODENAME=${CODENAME}" + @echo "DEBIMAN_SERVING_DIR=${DEBIMAN_SERVING_DIR}" # Note: the behaviour of debiman is to download all the manpages, then render them all. # This cannot be changed. Since we're only interested in the raw manpages and not # the HTML output, -only_render_pkgs is a hack that will make debiman render only one # page (0ad) and quit. download: vars - debiman -sync_codenames="buster,bullseye,bookworm" -sync_suites= -serving_dir="${DEBIMAN_SERVING_DIR}" -only_render_pkgs="0ad" + debiman -remote_mirror="${MIRROR}" -sync_codenames="${CODENAME}" -sync_suites= -serving_dir="${DEBIMAN_SERVING_DIR}" -only_render_pkgs="0ad" # Extract downloaded pages since they're compressed by default extract: vars find "${DEBIMAN_SERVING_DIR}" -type f,l -name "*.gz" -exec gunzip --decompress --force --keep "{}" \; -# Convert manpages from roff to plaintext +# Convert manpages from roff to other formats # Manpage files are named "page.section.lang". # TODO ! -name "stress-ng.1.en" ! -name "md.4.en" -# this is a hack for skipping the rendering of those pages. The version of mandoc +# this is a hack for skipping the parsing of those pages. The version of mandoc # in Debian is outdated and gets stuck in a infinite loop. Remove this hack if # using a more recent mandoc. 
convert: vars diff --git a/scripts/rdf.py b/scripts/rdf.py index f31daf9..3db52db 100755 --- a/scripts/rdf.py +++ b/scripts/rdf.py @@ -8,8 +8,13 @@ import sys from rdflib import BNode, Graph, Literal, Namespace, URIRef from rdflib.namespace import RDF +CODENAME = os.getenv('CODENAME') DEBIMAN_SERVING_DIR = os.getenv('DEBIMAN_SERVING_DIR') +if not CODENAME: + print('envvar CODENAME is not defined') + exit() + if not DEBIMAN_SERVING_DIR: print('envvar DEBIMAN_SERVING_DIR is not defined') exit() @@ -34,7 +39,7 @@ def percent_encode(string): .replace('[', '%5B') \ .replace(']', '%5D') -for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): +for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR, CODENAME.lower()).glob('**/*.roff'): if not absolute_file_path.is_file(): exit('Not a file: {}'.format(absolute_file_path)) @@ -73,9 +78,8 @@ for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): except: html = '' - # Create a separate graph node for this manpage - # A separate, temporary graph for a manpage that is printed out immediately - # because I haven't got enough RAM for storing thousands of pages. + # Create a new graph for this manpage that is printed out immediately + # because I haven't got enough RAM for storing the whole graph. g_page = Graph() page_ref = URIRef(percent_encode(f'dokk:manpages:debian/{distro_number}/{distro_package}/{filename}')) @@ -94,6 +98,9 @@ for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): print(g_page.serialize(format='nt')) + # Now we're going to create nodes for the Debian distro as well as the package + # this page belongs to. + # Create a graph node for this package # Link to the page node package_ref = URIRef(percent_encode(f'dokk:manpages:debian/{distro_number}/{distro_package}'))