#!/usr/bin/env python3
"""Take the plaintext of a PDF as input and output RDF (JSON-LD).

Use pdftotext to convert library PDF files into plaintext:
https://dokk.org/manpages/debian/12/poppler-utils/pdftotext.1.en

    mkdir nodes txt
    for file in *.pdf; do pdftotext -layout "$file" - > txt/"$(basename "$file").txt"; done
    for file in txt/*; do ./txt2rdf.py "$(basename "$file")"; done

The single command-line argument is the *basename* of a file inside
``txt/`` and must end in ``.pdf.txt``.  The resulting node is written to
``nodes/<name>.jsonld``.  Both ``txt/`` and ``nodes/`` are generated
working directories, resolved relative to the current directory, and are
listed in this directory's .gitignore.
"""

import json
import sys

# Suffix produced by the conversion loop above ("<name>.pdf" + ".txt").
TXT_SUFFIX = '.pdf.txt'


def node_id(filename):
    """Return *filename* with the trailing ``.pdf.txt`` removed.

    Raises:
        ValueError: if *filename* does not end in ``.pdf.txt``.  An explicit
            exception is used instead of ``assert`` because assertions are
            stripped when Python runs with ``-O``.
    """
    if not filename.endswith(TXT_SUFFIX):
        raise ValueError(
            f'expected a file name ending in {TXT_SUFFIX!r}, got {filename!r}'
        )
    return filename[:-len(TXT_SUFFIX)]


def build_node(stem, plaintext):
    """Return the JSON-LD node for the PDF named ``<stem>.pdf``.

    Args:
        stem: file name of the PDF without its ``.pdf`` extension.
        plaintext: text extracted from that PDF.
    """
    return {
        '@context': {
            'blob': 'dokk:vocab:blob:',
        },
        '@id': f'file:/pdf/{stem}.pdf',
        'blob:pdftotext': plaintext,
    }


def main(argv=None):
    """Convert ``txt/<argv[0]>`` into ``nodes/<stem>.jsonld``.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.  Only the first argument is used.

    Returns:
        Process exit status: 0 on success, 1 for a file name with the
        wrong suffix, 2 when no argument was given.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print(f'usage: {sys.argv[0]} NAME{TXT_SUFFIX}', file=sys.stderr)
        return 2

    filename = args[0]
    try:
        stem = node_id(filename)
    except ValueError as err:
        print(f'{sys.argv[0]}: {err}', file=sys.stderr)
        return 1

    # pdftotext writes UTF-8 by default; do not depend on the locale.
    with open(f'txt/{filename}', 'r', encoding='utf-8') as file:
        plaintext = file.read()

    # ensure_ascii=False emits raw non-ASCII characters, so the output
    # file must be opened with an encoding that can represent them.
    with open(f'nodes/{stem}.jsonld', 'w', encoding='utf-8') as file:
        json.dump(build_node(stem, plaintext), file, indent=4, ensure_ascii=False)

    return 0


if __name__ == '__main__':
    sys.exit(main())