home » dokk/manpages.git
Author zPlus <zplus@peers.community> 2023-07-08 05:04:33
Committer zPlus <zplus@peers.community> 2023-07-08 05:04:33
Commit 66e0802 (patch)
Tree adbadf1
Parent(s)

Initial commit


commits diff: 0000000..66e0802
3 files changed, 146 insertions, 0 deletions (download)


Diffstat
-rw-r--r-- README 22
-rwxr-xr-x scripts/Makefile 35
-rwxr-xr-x scripts/rdf.py 89

Diff options
View
Side
Whitespace
Context lines
Inter-hunk lines
+22/-0 A   README
index 0000000..f52f445
old size: 0B - new size: 638B
new file mode: -rw-r--r--
@@ -0,0 +1,22 @@
1 + Software prerequisites: apt-cacher-ng debiman mandoc python3
2 +
3 + After installing apt-cacher-ng, the proxy is started automatically and should be
4 + listening on port 3142.
5 + debiman by default will download from http://localhost:3142/deb.debian.org/
6 + This setting can be changed with -local_mirror or -remote_mirror
7 +
8 + Download manpages from Debian:
9 +
10 + $ DEBIMAN_SERVING_DIR=/absolute/path make download
11 +
12 + Extract all the pages from .gz:
13 +
14 + $ DEBIMAN_SERVING_DIR=/absolute/path make extract
15 +
16 + Convert manpages to plaintext:
17 +
18 + $ DEBIMAN_SERVING_DIR=/absolute/path make convert
19 +
20 + Create RDF graph:
21 +
22 + $ DEBIMAN_SERVING_DIR=/absolute/path ./rdf.py

+35/-0 A   scripts/Makefile
index 0000000..12f02b2
old size: 0B - new size: 1K
new file mode: -rwxr-xr-x
@@ -0,0 +1,35 @@
1 + SHELL = /bin/bash
2 + DEBIMAN_SERVING_DIR ?= ./
3 +
4 +
5 + # This just prints out variables for displaying
6 + vars:
7 + @echo "Using envvar DEBIMAN_SERVING_DIR = ${DEBIMAN_SERVING_DIR}"
8 +
9 +
10 + # Note: the behaviour of debiman is to download all the manpages, then render them all.
11 + # This cannot be changed. Since we're only interested in the raw manpages and not
12 + # the HTML output, -only_render_pkgs is a hack that will make debiman render only one
13 + # package (0ad) and quit.
14 + download: vars
15 + debiman -sync_codenames="bookworm" -serving_dir="${DEBIMAN_SERVING_DIR}" -only_render_pkgs="0ad"
16 +
17 + # Extract downloaded pages since they're compressed by default
18 + extract: vars
19 + find "${DEBIMAN_SERVING_DIR}" -type f,l -name "*.gz" -exec gunzip --decompress --force --keep "{}" \;
20 +
21 + # Convert manpages from roff to plaintext
22 + # Manpage files are named "page.section.lang".
23 + # TODO ! -name "stress-ng.1.en" ! -name "md.4.en"
24 + # this is a hack for skipping the rendering of those pages. The version of mandoc
25 + # in Debian is outdated and gets stuck in an infinite loop. Remove this hack if
26 + # using a more recent mandoc.
27 + convert: vars
28 + while IFS= read -r file; do \
29 + echo "$${file}"; \
30 + cp "$${file}" "$${file}.roff"; \
31 + mandoc -T utf8 "$${file}" > "$${file}.roff.txt"; \
32 + mandoc -T html -O fragment "$${file}" > "$${file}.roff.html"; \
33 + done < <( find "${DEBIMAN_SERVING_DIR}" -type f -name "*.*.*" ! -name "*.gz" ! -name "*.roff" ! -name "*.txt" ! -name "*.html" ! -name "stress-ng.1.en" ! -name "md.4.en" )
34 +
35 + .PHONY: vars download extract

+89/-0 A   scripts/rdf.py
index 0000000..8763df6
old size: 0B - new size: 3K
new file mode: -rwxr-xr-x
@@ -0,0 +1,89 @@
1 + #!/usr/bin/env python3
2 +
3 + import os
4 + import pathlib
5 + import rdflib
6 + import sys
7 +
8 + from rdflib import BNode, Graph, Literal, Namespace, URIRef
9 + from rdflib.namespace import RDF
10 +
11 + DEBIMAN_SERVING_DIR = os.getenv('DEBIMAN_SERVING_DIR')
12 +
13 + if not DEBIMAN_SERVING_DIR:
14 + print('envvar DEBIMAN_SERVING_DIR is not defined')
15 + exit()
16 +
17 + for absolute_file_path in sorted(pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff')):
18 + if not absolute_file_path.is_file():
19 + print('Not a file: {}'.format(absolute_file_path))
20 + continue
21 +
22 + # Remove the DEBIMAN_SERVING_DIR prefix from the path
23 + # Also remove .roff suffix from the filename
24 + file = absolute_file_path.relative_to(DEBIMAN_SERVING_DIR).with_suffix('')
25 + file_parts = list(file.parts)
26 +
27 + # Replace debian codenames with version numbers
28 + if file_parts[0] == 'debian':
29 + file_parts[1] = file_parts[1].replace('bookworm', '12')
30 + file = pathlib.Path(*file_parts)
31 +
32 + name, section, language = file.name.rsplit('.', 2)
33 + section_number, subsection = int(section[:1]), section[1:]
34 +
35 + # Fix characters that cannot be used as valid URIs
36 + id = str(file).replace(' ', '_') \
37 + .replace('[', '%5B') \
38 + .replace(']', '%5D') \
39 + .replace('#', '%23')
40 +
41 + # Read files
42 + try:
43 + with open(absolute_file_path, 'r') as f:
44 + roff = f.read()
45 + except:
46 + roff = ''
47 +
48 + try:
49 + with open(f'{absolute_file_path}.txt', 'r') as f:
50 + plaintext = f.read()
51 + except:
52 + plaintext = ''
53 +
54 + try:
55 + with open(f'{absolute_file_path}.html', 'r') as f:
56 + html = f.read()
57 + except:
58 + html = ''
59 +
60 + MANPAGE = Namespace('dokk:manpages:')
61 +
62 + turtle = f"""
63 + {URIRef(MANPAGE[id]).n3()}
64 + {URIRef(MANPAGE.name).n3()} {Literal(name).n3()} ;
65 + {URIRef(MANPAGE.name_lowercase).n3()} {Literal(name.lower()).n3()} ;
66 + {URIRef(MANPAGE.section).n3()} {Literal(section).n3()} ;
67 + {URIRef(MANPAGE.section_lowercase).n3()} {Literal(section.lower()).n3()} ;
68 + {URIRef(MANPAGE.section_number).n3()} {Literal(section_number).n3()} ;
69 + {URIRef(MANPAGE.subsection).n3()} {Literal(subsection).n3()} ;
70 + {URIRef(MANPAGE.language).n3()} {Literal(language).n3()} ;
71 + {URIRef(MANPAGE.roff).n3()} {Literal(roff).n3()} ;
72 + {URIRef(MANPAGE.plaintext).n3()} {Literal(plaintext).n3()} ;
73 + {URIRef(MANPAGE.html).n3()} {Literal(html).n3()} ;
74 + """
75 +
76 + if file_parts[0] == 'debian':
77 + turtle += f"""
78 + {URIRef(MANPAGE.source).n3()} [
79 + {URIRef(MANPAGE.distribution_name).n3()} {Literal(file_parts[0]).n3()} ;
80 + {URIRef(MANPAGE.distribution_version).n3()} {Literal(file_parts[1]).n3()} ;
81 + {URIRef(MANPAGE.package).n3()} {Literal(file_parts[2]).n3()} ;
82 + {URIRef(MANPAGE.filename).n3()} {Literal(file.name).n3()} ;
83 + ] .
84 + """
85 +
86 + mangraph = Graph().parse(publicID='', format='turtle', data=turtle)
87 +
88 + triples = mangraph.serialize(destination=None, format='nt')
89 + sys.stdout.write(triples)