Author | zPlus <zplus@peers.community> 2024-06-20 07:16:03 |
Committer | zPlus <zplus@peers.community> 2024-06-20 07:16:03 |
Commit | fda9a5e (patch) |
Tree | f1f873a |
Parent(s) |
-rw-r--r-- | README | 21 | ||
-rwxr-xr-x | scripts/Makefile | 20 | ||
-rwxr-xr-x | scripts/rdf.py | 15 |
index b9e9121..9bc1022 | |||
old size: 695B - new size: 1004B | |||
@@ -1,24 +1,33 @@ | |||
1 | - | Software prerequisites: apt-cacher-ng debiman mandoc python3 | |
1 | + | Software prerequisites: | |
2 | + | - apt-cacher-ng | |
3 | + | - debiman (install from repository, follow instructions in README) | |
4 | + | - mandoc | |
5 | + | - python3 | |
6 | + | - python3-venv | |
2 | 7 | ||
3 | 8 | After installing apt-cacher-ng, the proxy is started automatically and should be | |
4 | 9 | listening on port 3142. | |
5 | 10 | debiman by default will download from http://localhost:3142/deb.debian.org/ | |
6 | - | This setting can be changed with -local_mirror or -remote_mirror | |
11 | + | This option can be changed with -local_mirror or -remote_mirror. Debian releases that are
12 | + | no longer supported are removed from the mirrors; use the archive instead: https://archive.debian.org | |
7 | 13 | ||
8 | 14 | How to use the scripts: | |
9 | 15 | ||
10 | 16 | 1. Download manpages from Debian: | |
11 | 17 | ||
12 | - | $ DEBIMAN_SERVING_DIR=/absolue/path make download | |
18 | + | $ MIRROR="https://archive.debian.org" CODENAME="buster" DEBIMAN_SERVING_DIR=/path make download | |
13 | 19 | ||
14 | 20 | 2. Extract all the pages from .gz: | |
15 | 21 | ||
16 | - | $ DEBIMAN_SERVING_DIR=/absolue/path make extract | |
22 | + | $ DEBIMAN_SERVING_DIR=/path make extract | |
17 | 23 | ||
18 | 24 | 3. Convert manpages to plaintext and html: | |
19 | 25 | ||
20 | - | $ DEBIMAN_SERVING_DIR=/absolue/path make convert | |
26 | + | $ DEBIMAN_SERVING_DIR=/path make convert | |
21 | 27 | ||
22 | 28 | 4. Create RDF graph: | |
23 | 29 | ||
24 | - | $ DEBIMAN_SERVING_DIR=/absolue/path ./rdf.py | |
30 | + | $ python3 -m venv venv | |
31 | + | $ source venv/bin/activate | |
32 | + | $ pip install rdflib | |
33 | + | $ CODENAME="buster" DEBIMAN_SERVING_DIR=/path ./rdf.py |
index 3d83416..cc4485f | |||
old size: 2K - new size: 2K | |||
@@ -1,31 +1,35 @@ | |||
1 | 1 | SHELL = /bin/bash | |
2 | 2 | ||
3 | - | # THIS PATH MUST BE ABSOLUTE. | |
3 | + | # The mirror to download from | |
4 | + | MIRROR ?= | |
5 | + | ||
4 | 6 | # The path where debiman will save files. | |
5 | - | # There is a bug in debiman, it won't accept a relative path | |
6 | - | DEBIMAN_SERVING_DIR ?= ./ | |
7 | + | DEBIMAN_SERVING_DIR ?= | |
7 | 8 | ||
9 | + | # Name of the distribution to be processed | |
10 | + | CODENAME ?= | |
8 | 11 | ||
9 | 12 | # This just prints out variables for displaying | |
10 | 13 | vars: | |
11 | - | @echo "Using envvar DEBIMAN_SERVING_DIR = ${DEBIMAN_SERVING_DIR}" | |
12 | - | @echo "If you get errors, this path MUST be absolute." | |
14 | + | @echo "MIRROR=${MIRROR}" | |
15 | + | @echo "CODENAME=${CODENAME}" | |
16 | + | @echo "DEBIMAN_SERVING_DIR=${DEBIMAN_SERVING_DIR}" | |
13 | 17 | ||
14 | 18 | # Note: the behaviour of debiman is to download all the manpages, then render them all. | |
15 | 19 | # This cannot be changed. Since we're only interested in the raw manpages and not | |
16 | 20 | # the HTML output, -only_render_pkgs is a hack that will make debiman render only one | |
17 | 21 | # page (0ad) and quit. | |
18 | 22 | download: vars | |
19 | - | debiman -sync_codenames="buster,bullseye,bookworm" -sync_suites= -serving_dir="${DEBIMAN_SERVING_DIR}" -only_render_pkgs="0ad" | |
23 | + | debiman -remote_mirror="${MIRROR}" -sync_codenames="${CODENAME}" -sync_suites= -serving_dir="${DEBIMAN_SERVING_DIR}" -only_render_pkgs="0ad" | |
20 | 24 | ||
21 | 25 | # Extract downloaded pages since they're compressed by default | |
22 | 26 | extract: vars | |
23 | 27 | find "${DEBIMAN_SERVING_DIR}" -type f,l -name "*.gz" -exec gunzip --decompress --force --keep "{}" \; | |
24 | 28 | ||
25 | - | # Convert manpages from roff to plaintext | |
29 | + | # Convert manpages from roff to other formats | |
26 | 30 | # Manpage files are named "page.section.lang". | |
27 | 31 | # TODO ! -name "stress-ng.1.en" ! -name "md.4.en" | |
28 | - | # this is a hack for skipping the rendering of those pages. The version of mandoc | |
32 | + | # this is a hack for skipping the parsing of those pages. The version of mandoc | |
29 | 33 | # in Debian is outdated and gets stuck in an infinite loop. Remove this hack if | |
30 | 34 | # using a more recent mandoc. | |
31 | 35 | convert: vars |
index f31daf9..3db52db | |||
old size: 4K - new size: 5K | |||
@@ -8,8 +8,13 @@ import sys | |||
8 | 8 | from rdflib import BNode, Graph, Literal, Namespace, URIRef | |
9 | 9 | from rdflib.namespace import RDF | |
10 | 10 | ||
11 | + | CODENAME = os.getenv('CODENAME') | |
11 | 12 | DEBIMAN_SERVING_DIR = os.getenv('DEBIMAN_SERVING_DIR') | |
12 | 13 | ||
14 | + | if not CODENAME: | |
15 | + | print('envvar CODENAME is not defined') | |
16 | + | exit() | |
17 | + | ||
13 | 18 | if not DEBIMAN_SERVING_DIR: | |
14 | 19 | print('envvar DEBIMAN_SERVING_DIR is not defined') | |
15 | 20 | exit() | |
@@ -34,7 +39,7 @@ def percent_encode(string): | |||
34 | 39 | .replace('[', '%5B') \ | |
35 | 40 | .replace(']', '%5D') | |
36 | 41 | ||
37 | - | for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): | |
42 | + | for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR, CODENAME.lower()).glob('**/*.roff'): | |
38 | 43 | if not absolute_file_path.is_file(): | |
39 | 44 | exit('Not a file: {}'.format(absolute_file_path)) | |
40 | 45 | ||
@@ -73,9 +78,8 @@ for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): | |||
73 | 78 | except: | |
74 | 79 | html = '' | |
75 | 80 | ||
76 | - | # Create a separate graph node for this manpage | |
77 | - | # A separate, temporary graph for a manpage that is printed out immediately | |
78 | - | # because I haven't got enough RAM for storing thousands of pages. | |
81 | + | # Create a new graph for this manpage that is printed out immediately | |
82 | + | # because I haven't got enough RAM for storing the whole graph. | |
79 | 83 | g_page = Graph() | |
80 | 84 | ||
81 | 85 | page_ref = URIRef(percent_encode(f'dokk:manpages:debian/{distro_number}/{distro_package}/{filename}')) | |
@@ -94,6 +98,9 @@ for absolute_file_path in pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff'): | |||
94 | 98 | ||
95 | 99 | print(g_page.serialize(format='nt')) | |
96 | 100 | ||
101 | + | # Now we're going to create nodes for the debian distro as well as the package | |
102 | + | # this page belongs to. | |
103 | + | ||
97 | 104 | # Create a graph node for this package | |
98 | 105 | # Link to the page node | |
99 | 106 | package_ref = URIRef(percent_encode(f'dokk:manpages:debian/{distro_number}/{distro_package}')) |