From 938e84f9bb1208df071723aa562ef0b923ad4665 Mon Sep 17 00:00:00 2001
From: zPlus
Date: Sat, 4 Jan 2025 20:41:20 +0100
Subject: [PATCH] Add script for converting PDF files in the library to
 plaintext.

---
Review note: txt2rdf.py was revised after submission (explicit argument and
suffix checks instead of assert, UTF-8 file I/O, no shadowing of builtin id).
The blob id on its index line is from the original submission.

 scripts/pdf2txt/.gitignore |  2 ++
 scripts/pdf2txt/README     |  6 ++++++
 scripts/pdf2txt/txt2rdf.py | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)
 create mode 100644 scripts/pdf2txt/.gitignore
 create mode 100644 scripts/pdf2txt/README
 create mode 100755 scripts/pdf2txt/txt2rdf.py

diff --git a/scripts/pdf2txt/.gitignore b/scripts/pdf2txt/.gitignore
new file mode 100644
index 0000000..2b8f3c1
--- /dev/null
+++ b/scripts/pdf2txt/.gitignore
@@ -0,0 +1,2 @@
+/nodes
+/txt
diff --git a/scripts/pdf2txt/README b/scripts/pdf2txt/README
new file mode 100644
index 0000000..b7a0a57
--- /dev/null
+++ b/scripts/pdf2txt/README
@@ -0,0 +1,6 @@
+Use pdftotext to convert library PDF files into plaintext.
+https://dokk.org/manpages/debian/12/poppler-utils/pdftotext.1.en
+
+    mkdir nodes txt
+    for file in *.pdf; do pdftotext -layout "$file" - > txt/"$(basename "$file").txt"; done
+    for file in txt/*; do ./txt2rdf.py "$(basename "$file")"; done
diff --git a/scripts/pdf2txt/txt2rdf.py b/scripts/pdf2txt/txt2rdf.py
new file mode 100755
index 0000000..941f8ba
--- /dev/null
+++ b/scripts/pdf2txt/txt2rdf.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+# Take the plaintext produced by pdftotext as input and output RDF (JSON-LD).
+#
+# Usage: ./txt2rdf.py NAME.pdf.txt
+# Reads txt/NAME.pdf.txt and writes nodes/NAME.jsonld.
+
+import json
+import sys
+
+SUFFIX = '.pdf.txt'
+
+if len(sys.argv) != 2:
+    sys.exit(f'Usage: {sys.argv[0]} NAME{SUFFIX}')
+
+filename = sys.argv[1]
+
+# Remove file extension. Checked explicitly rather than with assert, because
+# asserts are stripped under "python -O" and the slice below would then
+# silently produce a wrong id.
+if not filename.endswith(SUFFIX):
+    sys.exit(f'{filename}: expected a file name ending in {SUFFIX}')
+node_id = filename[:-len(SUFFIX)]
+
+# pdftotext writes UTF-8 by default; do not depend on the locale encoding.
+with open(f'txt/{filename}', 'r', encoding='utf-8') as file:
+    plaintext = file.read()
+
+node = {
+    '@context': {
+        'blob': 'dokk:vocab:blob:'
+    },
+
+    '@id': f'file:/pdf/{node_id}.pdf',
+    'blob:pdftotext': plaintext
+}
+
+# ensure_ascii=False emits non-ASCII characters verbatim, so the output file
+# must be opened with an encoding that can represent them.
+with open(f'nodes/{node_id}.jsonld', 'w', encoding='utf-8') as file:
+    json.dump(node, file, indent=4, ensure_ascii=False)