home » zplus/dokk.git
ID: 941f8bafeea915925bc63b569d416709f14c3762
27 lines — 512B — View raw


#!/usr/bin/env python3

# Take the plaintext as input and output RDF

import json
import sys

# Suffix carried by every plaintext file under txt/.
TXT_SUFFIX = '.pdf.txt'


def _build_node(doc_id, plaintext):
    """Return the JSON-LD node that attaches *plaintext* to the PDF *doc_id*."""
    return {
        '@context': {
            'blob': 'dokk:vocab:blob:'
        },

        '@id': f'file:/pdf/{doc_id}.pdf',
        'blob:pdftotext': plaintext
    }


def main(argv=None):
    """Convert one plaintext file into a JSON-LD node file.

    Reads ``txt/<name>.pdf.txt`` and writes ``nodes/<name>.jsonld``, both
    relative to the current working directory.

    Args:
        argv: Argument vector in the shape of ``sys.argv`` (program name
            first, then the plaintext file name). Defaults to ``sys.argv``.

    Raises:
        SystemExit: If the file name argument is missing or does not end
            with ``.pdf.txt``.
        OSError: If the input cannot be read or the output cannot be written.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        sys.exit(f'usage: {argv[0] if argv else "script"} <name>{TXT_SUFFIX}')

    filename = argv[1]

    # Validate explicitly rather than with `assert`: assertions are stripped
    # under `python -O`, which would let a wrong name be sliced into a bogus id.
    if not filename.endswith(TXT_SUFFIX):
        sys.exit(f'error: expected a file name ending in {TXT_SUFFIX!r}, '
                 f'got {filename!r}')

    # Remove file extension
    doc_id = filename[:-len(TXT_SUFFIX)]

    # Explicit UTF-8 on both ends: the output is written with
    # ensure_ascii=False, so relying on the locale's default encoding could
    # fail or emit non-UTF-8 JSON on systems whose locale is not UTF-8.
    with open(f'txt/{filename}', 'r', encoding='utf-8') as file:
        plaintext = file.read()

    node = _build_node(doc_id, plaintext)

    with open(f'nodes/{doc_id}.jsonld', 'w', encoding='utf-8') as file:
        json.dump(node, file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    main()