Author
|
zPlus <zplus@peers.community>
2025-01-06 13:55:02
|
Committer
|
zPlus <zplus@peers.community>
2025-01-06 13:55:02
|
Commit
|
2339dd5
(patch)
|
Tree
|
88df2d3
|
Parent(s)
|
|
pdf2txt read/write from/to pipe.
Edit the pdf2txt script to read the plaintext from stdin and write
JSON-LD to stdout.
commits diff:
938e84f..2339dd5
2 files changed,
13 insertions,
12 deletions
—
download
Diffstat
Diff options
+3/-3
M scripts/pdf2txt/README
1
|
1
|
|
Use pdftotext to convert library PDF files into plaintext.
|
2
|
2
|
|
https://dokk.org/manpages/debian/12/poppler-utils/pdftotext.1.en
|
3
|
3
|
|
|
4
|
|
- |
mkdir nodes txt
|
5
|
|
- |
for file in *.pdf; do pdftotext -layout "$file" - > txt/"$(basename "$file").txt"; done
|
6
|
|
- |
for file in txt/*; do ./txt2rdf.py "$(basename "$file")"; done
|
|
4
|
+ |
mkdir nodes
|
|
5
|
+ |
export PDF_FOLDER=
|
|
6
|
+ |
i=0; for file in $(find "$PDF_FOLDER" -type f -name '*.pdf'); do i=$(( i + 1 )); pdftotext -layout "$file" - | ./pdf2txt.py "$file" > nodes/$i.jsonld; done
|
+10/-9
R scripts/pdf2txt/txt2rdf.py -> scripts/pdf2txt/pdf2txt.py
3
|
3
|
|
# Take the plaintext as input and output RDF
|
4
|
4
|
|
|
5
|
5
|
|
import json
|
|
6
|
+ |
import os
|
6
|
7
|
|
import sys
|
|
8
|
+ |
from pathlib import Path
|
7
|
9
|
|
|
|
10
|
+ |
assert 'PDF_FOLDER' in os.environ
|
|
11
|
+ |
assert len(sys.argv) == 2
|
|
12
|
+ |
PDF_FOLDER = os.environ['PDF_FOLDER']
|
8
|
13
|
|
filename = sys.argv[1]
|
|
14
|
+ |
assert filename.endswith('.pdf')
|
|
15
|
+ |
plaintext = sys.stdin.read()
|
9
|
16
|
|
|
10
|
|
- |
# Remove file extension
|
11
|
|
- |
assert filename.endswith('.pdf.txt')
|
12
|
|
- |
id = filename[:-8]
|
13
|
|
- |
|
14
|
|
- |
with open(f'txt/{filename}', 'r') as file:
|
15
|
|
- |
plaintext = file.read()
|
|
17
|
+ |
file_id = Path(filename).relative_to(Path(PDF_FOLDER))
|
16
|
18
|
|
|
17
|
19
|
|
node = {
|
18
|
20
|
|
'@context': {
|
19
|
21
|
|
'blob': 'dokk:vocab:blob:'
|
20
|
22
|
|
},
|
21
|
23
|
|
|
22
|
|
- |
'@id': f'file:/pdf/{id}.pdf',
|
|
24
|
+ |
'@id': f'file:/pdf/{file_id}',
|
23
|
25
|
|
'blob:pdftotext': plaintext
|
24
|
26
|
|
}
|
25
|
27
|
|
|
26
|
|
- |
with open(f'nodes/{id}.jsonld', 'w') as file:
|
27
|
|
- |
json.dump(node, file, indent=4, ensure_ascii=False)
|
|
28
|
+ |
print(json.dumps(node, indent=4, ensure_ascii=False))
|