Author | zPlus <zplus@peers.community> 2023-07-08 05:04:33 |
Committer | zPlus <zplus@peers.community> 2023-07-08 05:04:33 |
Commit | 66e0802 (patch) |
Tree | adbadf1 |
Parent(s) |
-rw-r--r-- | README | 22 | ||
-rwxr-xr-x | scripts/Makefile | 35 | ||
-rwxr-xr-x | scripts/rdf.py | 89 |
index 0000000..f52f445 | |||
old size: 0B - new size: 638B | |||
new file mode: -rw-r--r-- |
@@ -0,0 +1,22 @@ | |||
1 | + | Software prerequisites: apt-cacher-ng debiman mandoc python3 | |
2 | + | ||
3 | + | After installing apt-cacher-ng, the proxy is started automatically and should be | |
4 | + | listening on port 3142. | |
5 | + | debiman by default will download from http://localhost:3142/deb.debian.org/ | |
6 | + | This setting can be changed with -local_mirror or -remote_mirror | |
7 | + | ||
8 | + | Download manpages from Debian: | |
9 | + | ||
10 | + | $ DEBIMAN_SERVING_DIR=/absolute/path make download | |
11 | + | ||
12 | + | Extract all the pages from .gz: | |
13 | + | ||
14 | + | $ DEBIMAN_SERVING_DIR=/absolute/path make extract | |
15 | + | ||
16 | + | Convert manpages to plaintext: | |
17 | + | ||
18 | + | $ DEBIMAN_SERVING_DIR=/absolute/path make convert | |
19 | + | ||
20 | + | Create RDF graph: | |
21 | + | ||
22 | + | $ DEBIMAN_SERVING_DIR=/absolute/path ./rdf.py |
index 0000000..12f02b2 | |||
old size: 0B - new size: 1K | |||
new file mode: -rwxr-xr-x |
@@ -0,0 +1,35 @@ | |||
# Recipes rely on bash process substitution ("< <( ... )") in the convert target.
SHELL = /bin/bash

# Root of the debiman output tree. Can be overridden from the environment.
DEBIMAN_SERVING_DIR ?= ./


# This just prints out variables for displaying
vars:
	@echo "Using envvar DEBIMAN_SERVING_DIR = ${DEBIMAN_SERVING_DIR}"


# Note: the behaviour of debiman is to download all the manpages, then render them all.
# This cannot be changed. Since we're only interested in the raw manpages and not
# the HTML output, -only_render_pkgs is a hack that will make debiman render only one
# page (0ad) and quit.
download: vars
	debiman -sync_codenames="bookworm" -serving_dir="${DEBIMAN_SERVING_DIR}" -only_render_pkgs="0ad"

# Extract downloaded pages since they're compressed by default.
# --keep leaves the original .gz next to the extracted file.
extract: vars
	find "${DEBIMAN_SERVING_DIR}" -type f,l -name "*.gz" -exec gunzip --decompress --force --keep "{}" \;

# Convert manpages from roff to plaintext and to an HTML fragment.
# Manpage files are named "page.section.lang". For every such file this writes
# "<file>.roff" (a copy of the source), "<file>.roff.txt" and "<file>.roff.html".
# TODO ! -name "stress-ng.1.en" ! -name "md.4.en"
# this is a hack for skipping the rendering of those pages. The version of mandoc
# in Debian is outdated and gets stuck in an infinite loop. Remove this hack if
# using a more recent mandoc.
convert: vars
	while IFS= read -r file; do \
		echo "$${file}"; \
		cp "$${file}" "$${file}.roff"; \
		mandoc -T utf8 "$${file}" > "$${file}.roff.txt"; \
		mandoc -T html -O fragment "$${file}" > "$${file}.roff.html"; \
	done < <( find "${DEBIMAN_SERVING_DIR}" -type f -name "*.*.*" ! -name "*.gz" ! -name "*.roff" ! -name "*.txt" ! -name "*.html" ! -name "stress-ng.1.en" ! -name "md.4.en" )

# None of the targets produce a file with their own name; "convert" must be
# listed too, otherwise a file or directory called "convert" would disable it.
.PHONY: vars download extract convert
index 0000000..8763df6 | |||
old size: 0B - new size: 3K | |||
new file mode: -rwxr-xr-x |
@@ -0,0 +1,89 @@ | |||
1 | + | #!/usr/bin/env python3 | |
2 | + | ||
3 | + | import os | |
4 | + | import pathlib | |
5 | + | import rdflib | |
6 | + | import sys | |
7 | + | ||
8 | + | from rdflib import BNode, Graph, Literal, Namespace, URIRef | |
9 | + | from rdflib.namespace import RDF | |
10 | + | ||
# Root of the debiman output tree, taken from the environment.
DEBIMAN_SERVING_DIR = os.getenv('DEBIMAN_SERVING_DIR')

# Prefix shared by every URI (subjects and predicates) emitted by this script.
MANPAGE_NAMESPACE = 'dokk:manpages:'


def read_text(path):
    """Return the content of *path* as text, or '' if it cannot be read.

    Missing companion files (.txt / .html) and undecodable content are
    tolerated on purpose: the corresponding property is emitted empty.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, UnicodeError):
        return ''


def split_manpage_name(filename):
    """Split a "page.section.lang" file name into its components.

    Returns a tuple (name, section, section_number, subsection, language),
    or None when the name does not have three dot-separated parts or the
    section does not start with a digit.
    """
    parts = filename.rsplit('.', 2)
    if len(parts) != 3:
        return None

    name, section, language = parts
    try:
        section_number = int(section[:1])
    except ValueError:
        return None

    return name, section, section_number, section[1:], language


def make_uri_id(file):
    """Return *file* as a string with characters that are invalid in URIs replaced."""
    return str(file).replace(' ', '_') \
                    .replace('[', '%5B') \
                    .replace(']', '%5D') \
                    .replace('#', '%23')


def build_turtle(namespace, uri_id, properties, source_properties=None):
    """Build one complete Turtle statement describing a manpage.

    *properties* and *source_properties* are lists of (predicate, value)
    pairs; predicates are local names resolved against *namespace*. When
    *source_properties* is given it is attached as a blank node under the
    "source" predicate. The statement is always terminated with ".", so the
    result can be parsed whether or not a source is present.
    """
    def pairs(items):
        return [
            f'{URIRef(namespace[key]).n3()} {Literal(value).n3()}'
            for key, value in items
        ]

    statements = pairs(properties)
    if source_properties:
        nested = ' ;\n        '.join(pairs(source_properties))
        statements.append(
            f'{URIRef(namespace.source).n3()} [\n        {nested}\n    ]'
        )

    body = ' ;\n    '.join(statements)
    return f'{URIRef(namespace[uri_id]).n3()}\n    {body} .\n'


def main():
    """Write every manpage found under DEBIMAN_SERVING_DIR to stdout as N-Triples.

    Diagnostics go to stderr so that stdout only ever contains triples.
    Returns the process exit status.
    """
    if not DEBIMAN_SERVING_DIR:
        print('envvar DEBIMAN_SERVING_DIR is not defined', file=sys.stderr)
        return 1

    manpage = Namespace(MANPAGE_NAMESPACE)

    for absolute_file_path in sorted(pathlib.Path(DEBIMAN_SERVING_DIR).glob('**/*.roff')):
        if not absolute_file_path.is_file():
            print('Not a file: {}'.format(absolute_file_path), file=sys.stderr)
            continue

        # Remove the DEBIMAN_SERVING_DIR prefix from the path
        # Also remove .roff suffix from the filename
        file = absolute_file_path.relative_to(DEBIMAN_SERVING_DIR).with_suffix('')
        file_parts = list(file.parts)

        # Only paths deep enough to carry distribution/version/package
        # information are treated as coming from Debian.
        from_debian = len(file_parts) >= 3 and file_parts[0] == 'debian'

        # Replace debian codenames with version numbers
        if from_debian:
            file_parts[1] = file_parts[1].replace('bookworm', '12')
            file = pathlib.Path(*file_parts)

        parsed = split_manpage_name(file.name)
        if parsed is None:
            print('Skipping unrecognized manpage name: {}'.format(absolute_file_path),
                  file=sys.stderr)
            continue
        name, section, section_number, subsection, language = parsed

        properties = [
            ('name', name),
            ('name_lowercase', name.lower()),
            ('section', section),
            ('section_lowercase', section.lower()),
            ('section_number', section_number),
            ('subsection', subsection),
            ('language', language),
            ('roff', read_text(absolute_file_path)),
            ('plaintext', read_text(f'{absolute_file_path}.txt')),
            ('html', read_text(f'{absolute_file_path}.html')),
        ]

        source_properties = None
        if from_debian:
            source_properties = [
                ('distribution_name', file_parts[0]),
                ('distribution_version', file_parts[1]),
                ('package', file_parts[2]),
                ('filename', file.name),
            ]

        turtle = build_turtle(manpage, make_uri_id(file), properties, source_properties)

        # Round-trip through rdflib to validate the statement and to get
        # N-Triples, which can be concatenated across files.
        mangraph = Graph().parse(publicID='', format='turtle', data=turtle)
        sys.stdout.write(mangraph.serialize(destination=None, format='nt'))

    return 0


if __name__ == '__main__':
    sys.exit(main())