ID: 941f8bafeea915925bc63b569d416709f14c3762
27 lines
—
512B —
View raw
#!/usr/bin/env python3
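# Take the name of a pdftotext plaintext file (txt/<name>.pdf.txt) as the
# only argument and write its content as an RDF node in JSON-LD format
# (nodes/<name>.jsonld).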
import json
import sys
# Extension every input file name must carry; it is stripped to get the id.
_TXT_SUFFIX = '.pdf.txt'


def main(argv=None):
    """Convert one plaintext dump into a JSON-LD node file.

    Reads ``txt/<name>.pdf.txt`` and writes ``nodes/<name>.jsonld``; both
    paths are relative to the current working directory. The node's ``@id``
    is ``file:/pdf/<name>.pdf`` and the complete plaintext is stored under
    the ``blob:pdftotext`` key.

    Args:
        argv: Argument vector laid out like ``sys.argv`` (program name
            first, then the input file name). Defaults to ``sys.argv``.

    Raises:
        SystemExit: If the file name argument is missing or does not end
            with ``.pdf.txt``.
        OSError: If the input cannot be read or the output cannot be
            written.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        sys.exit(f'usage: {argv[0] if argv else "script"} <name>{_TXT_SUFFIX}')
    filename = argv[1]

    # Explicit check instead of `assert`: assertions vanish under `python -O`,
    # which would let a wrongly named file produce a garbage id.
    if not filename.endswith(_TXT_SUFFIX):
        sys.exit(f'error: expected a file name ending in {_TXT_SUFFIX!r}, '
                 f'got {filename!r}')

    # Remove file extension
    doc_id = filename[:-len(_TXT_SUFFIX)]

    # Pin UTF-8 on both ends so the result does not depend on the locale;
    # ensure_ascii=False below writes non-ASCII characters verbatim.
    with open(f'txt/{filename}', 'r', encoding='utf-8') as file:
        plaintext = file.read()

    node = {
        '@context': {
            'blob': 'dokk:vocab:blob:'
        },
        '@id': f'file:/pdf/{doc_id}.pdf',
        'blob:pdftotext': plaintext
    }

    with open(f'nodes/{doc_id}.jsonld', 'w', encoding='utf-8') as file:
        json.dump(node, file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    main()
|