diff --git a/nodes/theoryofcomputing.jsonld b/nodes/theoryofcomputing.jsonld
new file mode 100644
index 0000000..6161d7f
--- /dev/null
+++ b/nodes/theoryofcomputing.jsonld
@@ -0,0 +1,11 @@
+{
+    "@context": {
+        "library": "dokk:vocab:library:"
+    },
+    "@type": [
+        "library:Journal"
+    ],
+    "@id": "dokk:theoryofcomputing",
+    "library:title": "Theory of Computing",
+    "library:website": "https://theoryofcomputing.org"
+}
diff --git a/scripts/theoryofcomputing.org/.gitignore b/scripts/theoryofcomputing.org/.gitignore
new file mode 100644
index 0000000..b950c4c
--- /dev/null
+++ b/scripts/theoryofcomputing.org/.gitignore
@@ -0,0 +1,4 @@
+/nodes
+/pdf
+/theoryofcomputing.org
+/venv
diff --git a/scripts/theoryofcomputing.org/README b/scripts/theoryofcomputing.org/README
new file mode 100644
index 0000000..ababd72
--- /dev/null
+++ b/scripts/theoryofcomputing.org/README
@@ -0,0 +1,27 @@
+Articles are grouped by volume, and they are indexed at https://theoryofcomputing.org/articles/main/
+Each article page has a link to a source.zip file containing all the info about the specific
+article. The idea is to download all these zip files and extract info from them.
+
+The website has instructions for mirroring with rsync, but they are outdated. Therefore
+we scrape the website using wget instead.
+
+Parsing LaTeX from Python is a nightmare (there is no suitable module, and not all papers
+use the same LaTeX snippets), therefore some data is extracted from the articles' HTML pages
+(they use Google Scholar citation_* meta tags).
+
+
+Mirror the whole website:
+
+    wget --mirror https://theoryofcomputing.org
+
+
+Decompress all "source.zip" archives into "source.zip.decompressed":
+
+    find -type f -name "source.zip" -exec unzip -d "{}.decompressed" "{}" \;
+
+
+Extract data from the mirror and create the nodes:
+
+    mkdir --parents pdf/theoryofcomputing.org
+    mkdir nodes
+    find -type d -regex ".*/articles/v[0-9][0-9][0-9]a[0-9][0-9][0-9]$" -exec ./toc.py {} \;
diff --git a/scripts/theoryofcomputing.org/requirements.txt b/scripts/theoryofcomputing.org/requirements.txt
new file mode 100644
index 0000000..f6c88ce
--- /dev/null
+++ b/scripts/theoryofcomputing.org/requirements.txt
@@ -0,0 +1,6 @@
+lxml
+
+# https://github.com/sciunto-org/python-bibtexparser
+--pre
+bibtexparser
+
diff --git a/scripts/theoryofcomputing.org/toc.py b/scripts/theoryofcomputing.org/toc.py
new file mode 100755
index 0000000..e793b0d
--- /dev/null
+++ b/scripts/theoryofcomputing.org/toc.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+# This script expects an article folder as input, for example:
+# ./theoryofcomputing.org/articles/v001a001
+
+import bibtexparser
+import json
+import os
+import re
+import subprocess
+import sys
+from datetime import datetime
+from lxml import html
+
+article_path = sys.argv[1]
+assert os.path.isdir(article_path)
+
+bibtex = None
+try:
+    bibtex = bibtexparser.parse_file(f'{article_path}/bibtex.txt')
+except Exception:
+    # Some articles do not have a bibtex file. They only contain
+    # "forewords" for special issues.
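+    # There is nothing to index in that case, so skip this folder.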
+    sys.exit()
+
+bibtex_key = bibtex.entries[0].key
+assert re.match('^v[0-9]{3}a[0-9]{3}$', bibtex_key)
+
+# Extract data from the bibtex entry
+title = bibtex.entries[0].fields_dict['title'].value
+authors = bibtex.entries[0].fields_dict['author'].value
+doi = bibtex.entries[0].fields_dict['doi'].value
+site = bibtex.entries[0].fields_dict['URL'].value
+license = None
+
+# In bibtex, authors are written as "surname, name [and ...]",
+# so split the string and reorder the parts to get "name surname"
+authors = [
+    ' '.join(part.strip() for part in author.split(',')[::-1])
+    for author in authors.split(' and ')
+]
+
+# Extract additional data from the HTML page using XPath
+html_tree = html.parse(f'{article_path}/index.html')
+
+license_node = html_tree.findall('.//*[@id="copyright"]//a[@rel="license"]')
+assert len(license_node) == 1
+license_url = license_node[0].get('href')
+
+if license_url == 'http://creativecommons.org/licenses/by/3.0/':
+    license = 'dokk:license:CC-BY-3.0'
+elif license_url == 'http://creativecommons.org/licenses/by-nd/2.0/':
+    license = 'dokk:license:CC-BY-ND-2.0'
+assert license
+
+pdf = html_tree.findall('.//meta[@name="citation_pdf_url"]')
+assert len(pdf) == 1
+pdf_url = pdf[0].get('content')
+assert pdf_url == f'https://theoryofcomputing.org/articles/{bibtex_key}/{bibtex_key}.pdf'
+
+# Copy the PDF file of the article to the output folder.
+# Stripping the "https://" prefix from the URL gives the file's path
+# inside the local wget mirror.
+pdf_source_file = pdf_url[8:]
+assert os.path.isfile(pdf_source_file)
+cp_ret = subprocess.run(['cp', pdf_source_file, './pdf/theoryofcomputing.org/'])
+assert cp_ret.returncode == 0  # No errors
+
+# Create the node
+node = {
+    '@context': {
+        'library': 'dokk:vocab:library:',
+        'license': 'dokk:vocab:license:',
+        'library:journal': { '@type': '@id' },
+        'license:licensed_under': { '@type': '@id' },
+        'blob': 'dokk:vocab:blob:'
+    },
+    '@type': [
+        'library:Item',
+        'library:JournalArticle'
+    ],
+    '@id': f'dokk:theoryofcomputing_{bibtex_key}',
+    'library:author': authors,
+    'library:journal': 'dokk:theoryofcomputing',
+    'license:licensed_under': license,
+    'library:title': title,
+    'blob:at': {
+        '@id': f'file:/pdf/theoryofcomputing.org/{bibtex_key}.pdf',
+        'blob:primary_source': pdf_url,
+        'blob:retrieval_date': datetime.now().strftime('%Y-%m-%d')
+    }
+}
+
+# Save the node to a file
+with open(f'nodes/theoryofcomputing_{bibtex_key}.jsonld', 'w') as file:
+    json.dump(node, file, indent=4, ensure_ascii=False)
+
+print(f'[done] {bibtex_key}')
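
A note on the author parsing in toc.py above: a minimal sketch of what the
list comprehension does, using hypothetical placeholder names (not taken from
a real article):

    # bibtex author fields look like "surname, name and surname, name"
    field = 'Doe, Jane and Roe, Richard'
    authors = [
        ' '.join(part.strip() for part in author.split(',')[::-1])
        for author in field.split(' and ')
    ]
    print(authors)  # ['Jane Doe', 'Richard Roe']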