#!/usr/bin/env python3

# This script expects an article folder as input, for example
# ./theoryofcomputing.org/articles/v001a001

import bibtexparser
import json
import os
import re
import subprocess
import sys
from datetime import datetime
from lxml import html

article_path = sys.argv[1]
assert os.path.isdir(article_path)

bibtex = None
try:
    bibtex = bibtexparser.parse_file(f'{article_path}/bibtex.txt')
except Exception:
    # Some articles do not have a bibtex file. They only contain
    # "forewords" for special issues.
    exit()

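# The bibtex key doubles as the article identifier, e.g. "v001a001"
# (volume 1, article 1), which the regex below enforces.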
bibtex_key = bibtex.entries[0].key
assert re.match('^v[0-9]{3}a[0-9]{3}$', bibtex_key)

# Extract data from bibtex
title = bibtex.entries[0].fields_dict['title'].value
authors = bibtex.entries[0].fields_dict['author'].value
doi = bibtex.entries[0].fields_dict['doi'].value
site = bibtex.entries[0].fields_dict['URL'].value
license = None

# In bibtex, authors are written as "surname, name [and ...]"
# therefore we split the string to get the author names
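# e.g. 'Doe, Jane and Roe, Richard' becomes ['Jane Doe', 'Richard Roe']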
authors = [ ' '.join([ name_part.strip() for name_part in author.split(',')[::-1] ]) for author in authors.split(' and ') ]


# Extract additional data from HTML using XPaths
html_tree = html.parse(f'{article_path}/index.html')

license_node = html_tree.findall('.//*[@id="copyright"]//a[@rel="license"]')
assert len(license_node) == 1
license_url = license_node[0].get('href')

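# Map the Creative Commons license URL to the corresponding dokk license node;
# the assert below fails on any license this script does not know about.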
if license_url == 'http://creativecommons.org/licenses/by/3.0/':
    license = 'dokk:license:CC-BY-3.0'
elif license_url == 'http://creativecommons.org/licenses/by-nd/2.0/':
    license = 'dokk:license:CC-BY-ND-2.0'
assert license

pdf = html_tree.findall('.//meta[@name="citation_pdf_url"]')
assert len(pdf) == 1
pdf_url = pdf[0].get('content')
assert pdf_url == f'https://theoryofcomputing.org/articles/{bibtex_key}/{bibtex_key}.pdf'

# Copy the PDF file of the article to the output folder
pdf_source_file = pdf_url[8:]  # drop the 'https://' prefix to get the local mirror path
assert os.path.isfile(pdf_source_file)
cp_ret = subprocess.run(['cp', pdf_source_file, './pdf/theoryofcomputing.org/'])
assert cp_ret.returncode == 0  # No errors

# Create the JSON-LD node for the article
node = {
    '@context': {
        'library': 'dokk:vocab:library:',
        'license': 'dokk:vocab:license:',
        'library:journal': { '@type': '@id' },
        'license:licensed_under': { '@type': '@id' },
        'blob': 'dokk:vocab:blob:'
    },
    '@type': [
        'library:Item',
        'library:JournalArticle'
    ],
    '@id': f'dokk:theoryofcomputing_{bibtex_key}',
    'library:author': authors,
    'library:journal': 'dokk:theoryofcomputing',
    'license:licensed_under': license,
    'library:title': title,
    'blob:at': {
        '@id': f'file:/pdf/theoryofcomputing.org/{bibtex_key}.pdf',
        'blob:primary_source': pdf_url,
        # Retrieval date in ISO YYYY-MM-DD format
        'blob:retrieval_date': f'{datetime.now().year}-{datetime.now().month:02d}-{datetime.now().day:02d}'
    }
}

# Save node to file
with open(f'nodes/theoryofcomputing_{bibtex_key}.jsonld', 'w') as file:
    json.dump(node, file, indent=4, ensure_ascii=False)

print(f'[done] {bibtex_key}')

exit()


# Unreachable leftover: additional metadata extraction from the same HTML tree,
# kept after the exit() above and never executed.
authors = html_tree.findall('.//*[@id="authorline"]//a')

issn = html_tree.findall('.//meta[@name="citation_issn"]')
lang = html_tree.findall('.//meta[@name="citation_language"]')
site = html_tree.findall('.//meta[@name="citation_abstract_html_url"]')

assert len(authors) > 0
assert len(issn) == 1
assert len(lang) > 0
assert len(site) == 1