ID: 56d6865cce9071e14d6b469b8cdf301e76caaf44
28 lines
—
563B —
View raw
#!/usr/bin/env python3
# Read a PDF's extracted plaintext from stdin and print an RDF node (JSON-LD) for it to stdout.
import json
import os
import sys
from pathlib import Path
def build_node(filename: str, pdf_folder: str, plaintext: str) -> dict:
    """Return the JSON-LD node that attaches *plaintext* to a PDF file.

    Args:
        filename: Path of the PDF file; must end with '.pdf' and be located
            inside ``pdf_folder``.
        pdf_folder: Folder the node id is computed relative to.
        plaintext: Text stored under the 'blob:pdftotext' key.

    Returns:
        A dict with '@context', '@id' and 'blob:pdftotext' entries.

    Raises:
        ValueError: If ``filename`` does not end with '.pdf' or is not
            located inside ``pdf_folder``.
    """
    if not filename.endswith('.pdf'):
        raise ValueError(f'not a .pdf file: {filename!r}')
    # relative_to() raises ValueError when filename is outside pdf_folder.
    # as_posix() keeps '/' separators in the id regardless of platform.
    file_id = Path(filename).relative_to(Path(pdf_folder)).as_posix()
    return {
        '@context': {
            'blob': 'dokk:vocab:blob:',
        },
        '@id': f'file:/pdf/{file_id}',
        'blob:pdftotext': plaintext,
    }


def main() -> None:
    """Build the node from argv, PDF_FOLDER and stdin, then print it as JSON.

    Exits with status 1 and a message on stderr when the arguments or the
    environment are invalid. Explicit checks are used instead of ``assert``
    because assertions are removed when Python runs with ``-O``.
    """
    if len(sys.argv) != 2:
        sys.exit(f'usage: {sys.argv[0]} FILE.pdf < plaintext')
    pdf_folder = os.environ.get('PDF_FOLDER')
    if pdf_folder is None:
        sys.exit('PDF_FOLDER environment variable is not set')
    try:
        node = build_node(sys.argv[1], pdf_folder, sys.stdin.read())
    except ValueError as err:
        sys.exit(str(err))
    # ensure_ascii=False keeps non-ASCII text readable in the output.
    print(json.dumps(node, indent=4, ensure_ascii=False))


if __name__ == '__main__':
    main()
|