diff --git a/scripts/pdf2txt/README b/scripts/pdf2txt/README index b7a0a57..ee19bd7 100644 --- a/scripts/pdf2txt/README +++ b/scripts/pdf2txt/README @@ -1,6 +1,6 @@ Use pdftotext to convert library PDF files into plaintext. https://dokk.org/manpages/debian/12/poppler-utils/pdftotext.1.en - mkdir nodes txt - for file in *.pdf; do pdftotext -layout "$file" - > txt/"$(basename "$file").txt"; done - for file in txt/*; do ./txt2rdf.py "$(basename "$file")"; done + mkdir nodes + export PDF_FOLDER= + i=0; for file in $(find $PDF_FOLDER -type f -name '*.pdf'); do i=$(( i + 1 )); pdftotext -layout "$file" - | ./pdf2txt.py "$file" > nodes/$i.jsonld; done diff --git a/scripts/pdf2txt/txt2rdf.py b/scripts/pdf2txt/pdf2txt.py similarity index 55% rename from scripts/pdf2txt/txt2rdf.py rename to scripts/pdf2txt/pdf2txt.py index 941f8ba..56d6865 100755 --- a/scripts/pdf2txt/txt2rdf.py +++ b/scripts/pdf2txt/pdf2txt.py @@ -3,25 +3,26 @@ # Take the plaintext as input and output RDF import json +import os import sys +from pathlib import Path +assert 'PDF_FOLDER' in os.environ +assert len(sys.argv) == 2 +PDF_FOLDER = os.environ['PDF_FOLDER'] filename = sys.argv[1] +assert filename.endswith('.pdf') +plaintext = sys.stdin.read() -# Remove file extension -assert filename.endswith('.pdf.txt') -id = filename[:-8] - -with open(f'txt/{filename}', 'r') as file: - plaintext = file.read() +file_id = Path(filename).relative_to(Path(PDF_FOLDER)) node = { '@context': { 'blob': 'dokk:vocab:blob:' }, - '@id': f'file:/pdf/{id}.pdf', + '@id': f'file:/pdf/{file_id}', 'blob:pdftotext': plaintext } -with open(f'nodes/{id}.jsonld', 'w') as file: - json.dump(node, file, indent=4, ensure_ascii=False) +print(json.dumps(node, indent=4, ensure_ascii=False))