zplus/dokk.git
Author    zPlus <zplus@peers.community> 2025-01-01 20:54:38
Committer zPlus <zplus@peers.community> 2025-01-01 20:54:38
Commit    67d09af
Tree      f4e677d
Parent(s) 92ed601

Add Theory of Computing journal.


commits diff: 92ed601..67d09af
5 files changed, 156 insertions, 0 deletions


Diffstat
-rw-r--r-- nodes/theoryofcomputing.jsonld 11
-rw-r--r-- scripts/theoryofcomputing.org/.gitignore 4
-rw-r--r-- scripts/theoryofcomputing.org/README 27
-rw-r--r-- scripts/theoryofcomputing.org/requirements.txt 6
-rwxr-xr-x scripts/theoryofcomputing.org/toc.py 108

+11/-0 A   nodes/theoryofcomputing.jsonld
index 0000000..6161d7f
old size: 0B - new size: 261B
new file mode: -rw-r--r--
@@ -0,0 +1,11 @@
+{
+    "@context": {
+        "library": "dokk:vocab:library:"
+    },
+    "@type": [
+        "library:JournalArticle"
+    ],
+    "@id": "dokk:theoryofcomputing",
+    "library:title": "Theory of Computing",
+    "library:website": "https://theoryofcomputing.org"
+}

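A consumer reads this node by expanding it against its @context: every compact
term such as "library:title" resolves to "dokk:vocab:library:title". A minimal
sketch of that expansion, assuming the pyld library (which is not part of this
commit):

    import json
    from pyld import jsonld

    with open('nodes/theoryofcomputing.jsonld') as f:
        node = json.load(f)

    # Expansion rewrites every "library:*" key to a full "dokk:vocab:library:*" IRI
    print(json.dumps(jsonld.expand(node), indent=4))
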
+4/-0 A   scripts/theoryofcomputing.org/.gitignore
index 0000000..b950c4c
old size: 0B - new size: 41B
new file mode: -rw-r--r--
@@ -0,0 +1,4 @@
+/nodes
+/pdf
+/theoryofcomputing.org
+/venv

+27/-0 A   scripts/theoryofcomputing.org/README
index 0000000..ababd72
old size: 0B - new size: 1K
new file mode: -rw-r--r--
@@ -0,0 +1,27 @@
+Articles are grouped by volume and indexed at https://theoryofcomputing.org/articles/main/
+Each article page links to a source.zip file containing all the information about that
+article. The idea is to download all these zip files and extract the data from them.
+
+There are instructions for mirroring with rsync, but they are outdated, so the website
+has to be scraped with wget instead.
+
+Parsing LaTeX in Python is a nightmare (no suitable module could be found, and not all
+papers use the same LaTeX snippets), therefore some data is extracted from the articles'
+HTML pages instead (they use Google Scholar citation_* <meta> tags).
+
+
+Mirror the whole website:
+
+    wget --mirror https://theoryofcomputing.org
+
+
+Decompress all "source.zip" archives into "source.zip.decompressed":
+
+    find -type f -name "source.zip" -exec unzip -d "{}.decompressed" "{}" \;
+
+
+Extract data from the mirror and create the nodes:
+
+    mkdir --parents pdf/theoryofcomputing.org
+    mkdir nodes
+    find -type d -regex ".*/articles/v[0-9][0-9][0-9]a[0-9][0-9][0-9]$" -exec ./toc.py {} \;
+6/-0 A   scripts/theoryofcomputing.org/requirements.txt
index 0000000..f6c88ce
old size: 0B - new size: 79B
new file mode: -rw-r--r--
@@ -0,0 +1,6 @@
+lxml
+
+# https://github.com/sciunto-org/python-bibtexparser
+--pre
+bibtexparser
+
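
The --pre flag is needed because toc.py uses the bibtexparser v2 API
(parse_file, entries[n].fields_dict), which at the time of this commit was
published only as a pre-release. A quick sanity check of the installed version:

    import bibtexparser

    # parse_string only exists in the 2.x pre-releases, not in stable 1.x
    library = bibtexparser.parse_string('@article{v001a001, title = {Example}}')
    print(library.entries[0].key)  # v001a001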

+108/-0 A   scripts/theoryofcomputing.org/toc.py
index 0000000..e793b0d
old size: 0B - new size: 3K
new file mode: -rwxr-xr-x
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+# This script expects an article folder as input, for example
+# ./theoryofcomputing.org/articles/v001a001
+
+import bibtexparser
+import json
+import os
+import re
+import subprocess
+import sys
+from datetime import datetime
+from lxml import html
+
+article_path = sys.argv[1]
+assert os.path.isdir(article_path)
+
+bibtex = None
+try:
+    bibtex = bibtexparser.parse_file(f'{article_path}/bibtex.txt')
+except Exception:
+    # Some articles do not have a bibtex file. They only contain
+    # "forewords" for special issues.
+    sys.exit()
+
+# Article keys look like "v001a001": volume 1, article 1
+bibtex_key = bibtex.entries[0].key
+assert re.match('^v[0-9]{3}a[0-9]{3}$', bibtex_key)
+
+# Extract data from bibtex
+title = bibtex.entries[0].fields_dict['title'].value
+authors = bibtex.entries[0].fields_dict['author'].value
+doi = bibtex.entries[0].fields_dict['doi'].value
+site = bibtex.entries[0].fields_dict['URL'].value
+license = None
+
+# In bibtex, authors are written as "Surname, Name [and ...]",
+# so split the string to recover the names, e.g.
+# "Doe, John and Roe, Jane" -> ["John Doe", "Jane Roe"]
+authors = [
+    ' '.join(part.strip() for part in reversed(author.split(',')))
+    for author in authors.split(' and ')
+]
+
+
+# Extract additional data from the article's HTML page using XPath queries
+html_tree = html.parse(f'{article_path}/index.html')
+
+license_node = html_tree.findall('.//*[@id="copyright"]//a[@rel="license"]')
+assert len(license_node) == 1
+license_url = license_node[0].get('href')
+
+if license_url == 'http://creativecommons.org/licenses/by/3.0/':
+    license = 'dokk:license:CC-BY-3.0'
+elif license_url == 'http://creativecommons.org/licenses/by-nd/2.0/':
+    license = 'dokk:license:CC-BY-ND-2.0'
+assert license  # Fail on licenses we do not know how to map
+
+pdf = html_tree.findall('.//meta[@name="citation_pdf_url"]')
+assert len(pdf) == 1
+pdf_url = pdf[0].get('content')
+assert pdf_url == f'https://theoryofcomputing.org/articles/{bibtex_key}/{bibtex_key}.pdf'
+
+# Copy the PDF file of the article to the output folder. Stripping the
+# leading "https://" turns the URL into a path inside the local mirror.
+pdf_source_file = pdf_url[8:]
+assert os.path.isfile(pdf_source_file)
+cp_ret = subprocess.run(['cp', pdf_source_file, './pdf/theoryofcomputing.org/'])
+assert cp_ret.returncode == 0  # No errors
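+# A subprocess-free alternative would be the standard library's shutil:
+#   shutil.copy(pdf_source_file, './pdf/theoryofcomputing.org/')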
+
+# Create the node
+node = {
+    '@context': {
+        'library': 'dokk:vocab:library:',
+        'license': 'dokk:vocab:license:',
+        'library:journal': { '@type': '@id' },
+        'license:licensed_under': { '@type': '@id' },
+        'blob': 'dokk:vocab:blob:'
+    },
+    '@type': [
+        'library:Item',
+        'library:JournalArticle'
+    ],
+    '@id': f'dokk:theoryofcomputing_{bibtex_key}',
+    'library:author': authors,
+    'library:journal': 'dokk:theoryofcomputing',
+    'license:licensed_under': license,
+    'library:title': title,
+    'blob:at': {
+        '@id': f'file:/pdf/theoryofcomputing.org/{bibtex_key}.pdf',
+        'blob:primary_source': pdf_url,
+        # Today's date in ISO YYYY-MM-DD format
+        'blob:retrieval_date': datetime.now().strftime('%Y-%m-%d')
+    }
+}
+
+# Save node to file
+with open(f'nodes/theoryofcomputing_{bibtex_key}.jsonld', 'w') as file:
+    json.dump(node, file, indent=4, ensure_ascii=False)
+
+print(f'[done] {bibtex_key}')
+
+sys.exit()
+
+
+# The queries below are never reached (the script exits above). They are
+# leftover exploration for fields that are not extracted yet.
+authors = html_tree.findall('.//*[@id="authorline"]//a')
+
+issn = html_tree.findall('.//meta[@name="citation_issn"]')
+lang = html_tree.findall('.//meta[@name="citation_language"]')
+site = html_tree.findall('.//meta[@name="citation_abstract_html_url"]')
+
+assert len(authors) > 0
+assert len(issn) == 1
+assert len(lang) > 0
+assert len(site) == 1
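
If the leftover queries above are ever wired in, the extra fields could be
folded into the node before it is saved. A minimal sketch; the property names
library:issn and library:language are illustrative guesses, not part of the
dokk vocabulary used by this commit:

    # Hypothetical continuation; property names are assumptions.
    issn = html_tree.findall('.//meta[@name="citation_issn"]')
    lang = html_tree.findall('.//meta[@name="citation_language"]')
    assert len(issn) == 1 and len(lang) > 0

    node['library:issn'] = issn[0].get('content')
    node['library:language'] = lang[0].get('content')  # e.g. "en"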