zplus/dokk.git
Author    zPlus <zplus@peers.community> 2025-01-01 20:54:38
Committer zPlus <zplus@peers.community> 2025-01-01 20:54:38
Commit    67d09af
Tree      f4e677d
Parent(s) 92ed601

Add Theory of Computing journal.


commits diff: 92ed601..67d09af
5 files changed, 156 insertions, 0 deletions


Diffstat
-rw-r--r-- nodes/theoryofcomputing.jsonld 11
-rw-r--r-- scripts/theoryofcomputing.org/.gitignore 4
-rw-r--r-- scripts/theoryofcomputing.org/README 27
-rw-r--r-- scripts/theoryofcomputing.org/requirements.txt 6
-rwxr-xr-x scripts/theoryofcomputing.org/toc.py 108

+11/-0 A   nodes/theoryofcomputing.jsonld
index 0000000..6161d7f
old size: 0B - new size: 261B
new file mode: -rw-r--r--
@@ -0,0 +1,11 @@
+{
+    "@context": {
+        "library": "dokk:vocab:library:"
+    },
+    "@type": [
+        "library:JournalArticle"
+    ],
+    "@id": "dokk:theoryofcomputing",
+    "library:title": "Theory of Computing",
+    "library:website": "https://theoryofcomputing.org"
+}

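A consumer reads this node by expanding it against its @context: every compact
term such as "library:title" resolves to "dokk:vocab:library:title". A minimal
sketch of that expansion, assuming the pyld library (which is not part of this
commit):

    import json
    from pyld import jsonld

    with open('nodes/theoryofcomputing.jsonld') as f:
        node = json.load(f)

    # Expansion rewrites every "library:*" key to a full "dokk:vocab:library:*" IRI
    print(json.dumps(jsonld.expand(node), indent=4))
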
+4/-0 A   scripts/theoryofcomputing.org/.gitignore
index 0000000..b950c4c
old size: 0B - new size: 41B
new file mode: -rw-r--r--
@@ -0,0 +1,4 @@
+/nodes
+/pdf
+/theoryofcomputing.org
+/venv

+27/-0 A   scripts/theoryofcomputing.org/README
index 0000000..ababd72
old size: 0B - new size: 1K
new file mode: -rw-r--r--
@@ -0,0 +1,27 @@
+Articles are grouped by volume and indexed at https://theoryofcomputing.org/articles/main/
+Each article page links to a source.zip file containing all the information about that
+article. The idea is to download all these zip files and extract the data from them.
+
+There are instructions for mirroring with rsync, but they are outdated, so the website
+has to be scraped with wget instead.
+
+Parsing LaTeX in Python is a nightmare (no suitable module could be found, and not all
+papers use the same LaTeX snippets), therefore some data is extracted from the articles'
+HTML pages instead (they use Google Scholar citation_* <meta> tags).
+
+
+Mirror the whole website:
+
+    wget --mirror https://theoryofcomputing.org
+
+
+Decompress all "source.zip" archives into "source.zip.decompressed":
+
+    find -type f -name "source.zip" -exec unzip -d "{}.decompressed" "{}" \;
+
+
+Extract data from the mirror and create the nodes:
+
+    mkdir --parents pdf/theoryofcomputing.org
+    mkdir nodes
+    find -type d -regex ".*/articles/v[0-9][0-9][0-9]a[0-9][0-9][0-9]$" -exec ./toc.py {} \;
+6/-0 A   scripts/theoryofcomputing.org/requirements.txt
index 0000000..f6c88ce
old size: 0B - new size: 79B
new file mode: -rw-r--r--
@@ -0,0 +1,6 @@
+lxml
+
+# https://github.com/sciunto-org/python-bibtexparser
+--pre
+bibtexparser
+
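
The --pre flag is needed because toc.py uses the bibtexparser v2 API
(parse_file, entries[n].fields_dict), which at the time of this commit was
published only as a pre-release. A quick sanity check of the installed version:

    import bibtexparser

    # parse_string only exists in the 2.x pre-releases, not in stable 1.x
    library = bibtexparser.parse_string('@article{v001a001, title = {Example}}')
    print(library.entries[0].key)  # v001a001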

+108/-0 A   scripts/theoryofcomputing.org/toc.py
index 0000000..e793b0d
old size: 0B - new size: 3K
new file mode: -rwxr-xr-x
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+# This script expects an article folder as input, for example
+# ./theoryofcomputing.org/articles/v001a001
+
+import bibtexparser
+import json
+import os
+import re
+import subprocess
+import sys
+from datetime import datetime
+from lxml import html
+
+article_path = sys.argv[1]
+assert os.path.isdir(article_path)
+
+bibtex = None
+try:
+    bibtex = bibtexparser.parse_file(f'{article_path}/bibtex.txt')
+except Exception:
+    # Some articles do not have a bibtex file. They only contain
+    # "forewords" for special issues.
+    sys.exit()
+
+# Article keys look like "v001a001": volume 1, article 1
+bibtex_key = bibtex.entries[0].key
+assert re.match('^v[0-9]{3}a[0-9]{3}$', bibtex_key)
+
+# Extract data from bibtex
+title = bibtex.entries[0].fields_dict['title'].value
+authors = bibtex.entries[0].fields_dict['author'].value
+doi = bibtex.entries[0].fields_dict['doi'].value
+site = bibtex.entries[0].fields_dict['URL'].value
+license = None
+
+# In bibtex, authors are written as "Surname, Name [and ...]",
+# so split the string to recover the names, e.g.
+# "Doe, John and Roe, Jane" -> ["John Doe", "Jane Roe"]
+authors = [
+    ' '.join(part.strip() for part in reversed(author.split(',')))
+    for author in authors.split(' and ')
+]
+
+
+# Extract additional data from the article's HTML page using XPath queries
+html_tree = html.parse(f'{article_path}/index.html')
+
+license_node = html_tree.findall('.//*[@id="copyright"]//a[@rel="license"]')
+assert len(license_node) == 1
+license_url = license_node[0].get('href')
+
+if license_url == 'http://creativecommons.org/licenses/by/3.0/':
+    license = 'dokk:license:CC-BY-3.0'
+elif license_url == 'http://creativecommons.org/licenses/by-nd/2.0/':
+    license = 'dokk:license:CC-BY-ND-2.0'
+assert license  # Fail on licenses we do not know how to map
+
+pdf = html_tree.findall('.//meta[@name="citation_pdf_url"]')
+assert len(pdf) == 1
+pdf_url = pdf[0].get('content')
+assert pdf_url == f'https://theoryofcomputing.org/articles/{bibtex_key}/{bibtex_key}.pdf'
+
+# Copy the PDF file of the article to the output folder. Stripping the
+# leading "https://" turns the URL into a path inside the local mirror.
+pdf_source_file = pdf_url[8:]
+assert os.path.isfile(pdf_source_file)
+cp_ret = subprocess.run(['cp', pdf_source_file, './pdf/theoryofcomputing.org/'])
+assert cp_ret.returncode == 0  # No errors
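+# A subprocess-free alternative would be the standard library's shutil:
+#   shutil.copy(pdf_source_file, './pdf/theoryofcomputing.org/')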
+
+# Create the node
+node = {
+    '@context': {
+        'library': 'dokk:vocab:library:',
+        'license': 'dokk:vocab:license:',
+        'library:journal': { '@type': '@id' },
+        'license:licensed_under': { '@type': '@id' },
+        'blob': 'dokk:vocab:blob:'
+    },
+    '@type': [
+        'library:Item',
+        'library:JournalArticle'
+    ],
+    '@id': f'dokk:theoryofcomputing_{bibtex_key}',
+    'library:author': authors,
+    'library:journal': 'dokk:theoryofcomputing',
+    'license:licensed_under': license,
+    'library:title': title,
+    'blob:at': {
+        '@id': f'file:/pdf/theoryofcomputing.org/{bibtex_key}.pdf',
+        'blob:primary_source': pdf_url,
+        # Today's date in ISO YYYY-MM-DD format
+        'blob:retrieval_date': datetime.now().strftime('%Y-%m-%d')
+    }
+}
+
+# Save node to file
+with open(f'nodes/theoryofcomputing_{bibtex_key}.jsonld', 'w') as file:
+    json.dump(node, file, indent=4, ensure_ascii=False)
+
+print(f'[done] {bibtex_key}')
+
+sys.exit()
+
+
+# The queries below are never reached (the script exits above). They are
+# leftover exploration for fields that are not extracted yet.
+authors = html_tree.findall('.//*[@id="authorline"]//a')
+
+issn = html_tree.findall('.//meta[@name="citation_issn"]')
+lang = html_tree.findall('.//meta[@name="citation_language"]')
+site = html_tree.findall('.//meta[@name="citation_abstract_html_url"]')
+
+assert len(authors) > 0
+assert len(issn) == 1
+assert len(lang) > 0
+assert len(site) == 1
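
If the leftover queries above are ever wired in, the extra fields could be
folded into the node before it is saved. A minimal sketch; the property names
library:issn and library:language are illustrative guesses, not part of the
dokk vocabulary used by this commit:

    # Hypothetical continuation; property names are assumptions.
    issn = html_tree.findall('.//meta[@name="citation_issn"]')
    lang = html_tree.findall('.//meta[@name="citation_language"]')
    assert len(issn) == 1 and len(lang) > 0

    node['library:issn'] = issn[0].get('content')
    node['library:language'] = lang[0].get('content')  # e.g. "en"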