#!/usr/bin/env python3

# This script expects an article folder as input, for example
# ./theoryofcomputing.org/articles/v001a001
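#
# Example invocation (the script filename here is a placeholder):
#
#   ./import_article.py ./theoryofcomputing.org/articles/v001a001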

import bibtexparser
import json
import os
import re
import shutil
import sys
from datetime import datetime
from lxml import html

article_path = sys.argv[1]
assert os.path.isdir(article_path)

try:
    bibtex = bibtexparser.parse_file(f'{article_path}/bibtex.txt')
except FileNotFoundError:
    # Some articles do not have a bibtex file. They only contain
    # "forewords" for special issues, so there is nothing to import.
    sys.exit()

bibtex_key = bibtex.entries[0].key
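# Keys follow the article folder naming, e.g. "v001a001"
# (presumably volume 001, article 001)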
assert re.match('^v[0-9]{3}a[0-9]{3}$', bibtex_key)

# Extract data from bibtex
title   = bibtex.entries[0].fields_dict['title'].value
authors = bibtex.entries[0].fields_dict['author'].value
doi     = bibtex.entries[0].fields_dict['doi'].value
site    = bibtex.entries[0].fields_dict['URL'].value
license = None

# In bibtex, authors are written as "Surname, Name [and ...]",
# so we split on " and ", then reverse each "Surname, Name" pair
# to get "Name Surname"
authors = [
    ' '.join(name_part.strip() for name_part in author.split(',')[::-1])
    for author in authors.split(' and ')
]
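# For example (hypothetical names):
#   "Doe, John and Roe, Jane"  ->  ["John Doe", "Jane Roe"]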


# Extract additional data from HTML using XPaths
html_tree = html.parse(f'{article_path}/index.html')

license_node = html_tree.findall('.//*[@id="copyright"]//a[@rel="license"]')
assert len(license_node) == 1
license_url = license_node[0].get('href')

if license_url == 'http://creativecommons.org/licenses/by/3.0/':
    license = 'dokk:license:CC-BY-3.0'
elif license_url == 'http://creativecommons.org/licenses/by-nd/2.0/':
    license = 'dokk:license:CC-BY-ND-2.0'
assert license, f'Unmapped license URL: {license_url}'

pdf = html_tree.findall('.//meta[@name="citation_pdf_url"]')
assert len(pdf) == 1
pdf_url = pdf[0].get('content')
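# The PDF URL must follow the site's canonical pattern,
# e.g. https://theoryofcomputing.org/articles/v001a001/v001a001.pdf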
assert pdf_url == f'https://theoryofcomputing.org/articles/{bibtex_key}/{bibtex_key}.pdf'

# Copy the PDF file of the article to the output folder
# (the local path mirrors the URL, minus the "https://" scheme;
# the destination directory must already exist)
pdf_source_file = pdf_url[len('https://'):]
assert os.path.isfile(pdf_source_file)
shutil.copy(pdf_source_file, './pdf/theoryofcomputing.org/')

# Create the node
node = {
    '@context': {
        'library': 'dokk:vocab:library:',
        'license': 'dokk:vocab:license:',
        'library:journal': { '@type': '@id' },
        'license:licensed_under': { '@type': '@id' },
        'blob': 'dokk:vocab:blob:'
    },
    '@type': [
        'library:Item',
        'library:JournalArticle'
    ],
    '@id': f'dokk:theoryofcomputing_{bibtex_key}',
    'library:author': authors,
    'library:journal': 'dokk:theoryofcomputing',
    'license:licensed_under': license,
    'library:title': title,
    'blob:at': {
        '@id': f'file:/pdf/theoryofcomputing.org/{bibtex_key}.pdf',
        'blob:primary_source': pdf_url,
        'blob:retrieval_date': datetime.now().strftime('%Y-%m-%d')
    }
}

# Save node to file
with open(f'nodes/theoryofcomputing_{bibtex_key}.jsonld', 'w', encoding='utf-8') as file:
    json.dump(node, file, indent=4, ensure_ascii=False)
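
# The resulting file looks roughly like this (values are illustrative,
# not from a real article):
# {
#     "@context": { ... },
#     "@type": ["library:Item", "library:JournalArticle"],
#     "@id": "dokk:theoryofcomputing_v001a001",
#     "library:author": ["John Doe"],
#     "library:journal": "dokk:theoryofcomputing",
#     "license:licensed_under": "dokk:license:CC-BY-3.0",
#     "library:title": "...",
#     "blob:at": { ... }
# }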

print(f'[done] {bibtex_key}')

sys.exit()


# NOTE: the code below is never reached (see sys.exit() above). It is
# leftover work-in-progress for extracting further metadata from the HTML.
authors = html_tree.findall('.//*[@id="authorline"]//a')

issn    = html_tree.findall('.//meta[@name="citation_issn"]')
lang    = html_tree.findall('.//meta[@name="citation_language"]')
site    = html_tree.findall('.//meta[@name="citation_abstract_html_url"]')

assert len(authors) > 0
assert len(issn) == 1
assert len(lang) > 0
assert len(site) == 1