home » zplus/dokk.git
ID: 56d6865cce9071e14d6b469b8cdf301e76caaf44
28 lines — 563B — View raw


#!/usr/bin/env python3

# Read the plaintext of a PDF from stdin and print it to stdout as a
# JSON-LD (RDF) node identified by the PDF's path relative to $PDF_FOLDER.

import json
import os
import sys
from pathlib import Path

def build_node(filename, pdf_folder, plaintext):
    """Return the JSON-LD node describing the plaintext of one PDF.

    Args:
        filename: Path of the PDF file; must be located under *pdf_folder*.
        pdf_folder: Root folder that the node identifier is made relative to.
        plaintext: Text to store under the ``blob:pdftotext`` key.

    Returns:
        A dict with ``@context``, ``@id`` and ``blob:pdftotext`` keys.

    Raises:
        ValueError: If *filename* is not located under *pdf_folder*.
    """
    # as_posix() keeps forward slashes in the identifier on every platform;
    # str(Path) would emit backslashes on Windows.
    file_id = Path(filename).relative_to(Path(pdf_folder)).as_posix()

    return {
        '@context': {
            'blob': 'dokk:vocab:blob:'
        },

        '@id': f'file:/pdf/{file_id}',
        'blob:pdftotext': plaintext
    }


def main():
    """Validate the invocation, read stdin and print the node as JSON.

    Reads the root folder from the ``PDF_FOLDER`` environment variable and
    the PDF path from the single command-line argument. On invalid input a
    message is written to stderr and the process exits with status 1.
    """
    # Explicit checks instead of `assert`: assertions are removed when
    # Python runs with -O, which would silently disable the validation.
    if 'PDF_FOLDER' not in os.environ:
        sys.exit('error: the PDF_FOLDER environment variable is not set')
    if len(sys.argv) != 2:
        sys.exit('error: expected exactly one argument (path to a .pdf file)')

    pdf_folder = os.environ['PDF_FOLDER']
    filename = sys.argv[1]
    if not filename.endswith('.pdf'):
        sys.exit(f'error: not a .pdf file: {filename}')

    plaintext = sys.stdin.read()

    try:
        node = build_node(filename, pdf_folder, plaintext)
    except ValueError as err:
        # Raised by Path.relative_to() when filename is outside PDF_FOLDER.
        sys.exit(f'error: {err}')

    print(json.dumps(node, indent=4, ensure_ascii=False))


if __name__ == '__main__':
    main()