From 2339dd531becb3e7578d5fd0eacbb735ab2a414c Mon Sep 17 00:00:00 2001
From: zPlus
Date: Mon, 6 Jan 2025 14:55:02 +0100
Subject: [PATCH] pdf2txt read/write from/to pipe.

Edit the pdf2txt script to read the plaintext from stdin, and write
jsonld to stdout.
---
 scripts/pdf2txt/README     |  6 +++---
 scripts/pdf2txt/pdf2txt.py | 28 ++++++++++++++++++++++++++++
 scripts/pdf2txt/txt2rdf.py | 27 ---------------------------
 3 files changed, 31 insertions(+), 30 deletions(-)
 create mode 100755 scripts/pdf2txt/pdf2txt.py
 delete mode 100755 scripts/pdf2txt/txt2rdf.py

diff --git a/scripts/pdf2txt/README b/scripts/pdf2txt/README
index b7a0a57..ee19bd7 100644
--- a/scripts/pdf2txt/README
+++ b/scripts/pdf2txt/README
@@ -1,6 +1,6 @@
 Use pdftotext to convert library PDF files into plaintext.
 https://dokk.org/manpages/debian/12/poppler-utils/pdftotext.1.en
 
-    mkdir nodes txt
-    for file in *.pdf; do pdftotext -layout "$file" - > txt/"$(basename "$file").txt"; done
-    for file in txt/*; do ./txt2rdf.py "$(basename "$file")"; done
+    mkdir nodes
+    export PDF_FOLDER=
+    i=0; for file in $(find "$PDF_FOLDER" -type f -name '*.pdf'); do i=$(( i + 1 )); pdftotext -layout "$file" - | ./pdf2txt.py "$file" > nodes/$i.jsonld; done
diff --git a/scripts/pdf2txt/pdf2txt.py b/scripts/pdf2txt/pdf2txt.py
new file mode 100755
index 0000000..56d6865
--- /dev/null
+++ b/scripts/pdf2txt/pdf2txt.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+# Take the plaintext as input and output RDF
+
+import json
+import os
+import sys
+from pathlib import Path
+
+assert 'PDF_FOLDER' in os.environ
+assert len(sys.argv) == 2
+PDF_FOLDER = os.environ['PDF_FOLDER']
+filename = sys.argv[1]
+assert filename.endswith('.pdf')
+plaintext = sys.stdin.read()
+
+file_id = Path(filename).relative_to(Path(PDF_FOLDER))
+
+node = {
+    '@context': {
+        'blob': 'dokk:vocab:blob:'
+    },
+
+    '@id': f'file:/pdf/{file_id}',
+    'blob:pdftotext': plaintext
+}
+
+print(json.dumps(node, indent=4, ensure_ascii=False))
diff --git a/scripts/pdf2txt/txt2rdf.py b/scripts/pdf2txt/txt2rdf.py
deleted file mode 100755
index 941f8ba..0000000
--- a/scripts/pdf2txt/txt2rdf.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python3
-
-# Take the plaintext as input and output RDF
-
-import json
-import sys
-
-filename = sys.argv[1]
-
-# Remove file extension
-assert filename.endswith('.pdf.txt')
-id = filename[:-8]
-
-with open(f'txt/{filename}', 'r') as file:
-    plaintext = file.read()
-
-node = {
-    '@context': {
-        'blob': 'dokk:vocab:blob:'
-    },
-
-    '@id': f'file:/pdf/{id}.pdf',
-    'blob:pdftotext': plaintext
-}
-
-with open(f'nodes/{id}.jsonld', 'w') as file:
-    json.dump(node, file, indent=4, ensure_ascii=False)