Author
|
zPlus <zplus@peers.community>
2025-01-04 19:41:20
|
Committer
|
zPlus <zplus@peers.community>
2025-01-04 19:41:20
|
Commit
|
938e84f
(patch)
|
Tree
|
6da05eb
|
Parent(s)
|
|
Add script for converting PDF files in the library to plaintext.
commits diff:
7dee1eb..938e84f
3 files changed,
35 insertions,
0 deletions
—
download
Diffstat
Diff options
+2/-0
A scripts/pdf2txt/.gitignore
index
0000000..2b8f3c1
|
old size: 0B
-
new size: 12B
|
new file mode: -rw-r--r--
|
+6/-0
A scripts/pdf2txt/README
index
0000000..b7a0a57
|
old size: 0B
-
new size: 304B
|
new file mode: -rw-r--r--
|
|
1
|
+ |
Use pdftotext to convert library PDF files into plaintext, then run txt2rdf.py to wrap each text file in a JSON-LD node.
|
|
2
|
+ |
https://dokk.org/manpages/debian/12/poppler-utils/pdftotext.1.en
|
|
3
|
+ |
|
|
4
|
+ |
mkdir nodes txt
|
|
5
|
+ |
for file in *.pdf; do pdftotext -layout "$file" - > txt/"$(basename "$file").txt"; done
|
|
6
|
+ |
for file in txt/*; do ./txt2rdf.py "$(basename "$file")"; done
|
+27/-0
A scripts/pdf2txt/txt2rdf.py
index
0000000..941f8ba
|
old size: 0B
-
new size: 512B
|
new file mode: -rwxr-xr-x
|
|
1
|
+ |
#!/usr/bin/env python3

"""Wrap a pdftotext plaintext dump in a JSON-LD node.

Usage: txt2rdf.py NAME.pdf.txt

Reads ``txt/NAME.pdf.txt`` and writes ``nodes/NAME.jsonld``. Both paths are
relative to the current working directory.
"""

import json
import sys

# Every input file is named after its PDF with ".txt" appended.
TXT_SUFFIX = '.pdf.txt'


def document_id(filename: str) -> str:
    """Return *filename* with its trailing ``.pdf.txt`` suffix removed.

    Raises:
        ValueError: if *filename* does not end with ``.pdf.txt``.
    """
    # An explicit check rather than ``assert``: assertions are stripped under
    # ``python -O``, which would let a wrong name be silently mis-sliced.
    if not filename.endswith(TXT_SUFFIX):
        raise ValueError(
            f'expected a file name ending in {TXT_SUFFIX!r}, got {filename!r}'
        )
    return filename[:-len(TXT_SUFFIX)]


def build_node(doc_id: str, plaintext: str) -> dict:
    """Return the JSON-LD node attaching *plaintext* to the PDF *doc_id*."""
    return {
        '@context': {
            'blob': 'dokk:vocab:blob:',
        },
        '@id': f'file:/pdf/{doc_id}.pdf',
        'blob:pdftotext': plaintext,
    }


def convert(filename: str) -> None:
    """Read ``txt/<filename>`` and write the matching ``nodes/<id>.jsonld``.

    Raises:
        ValueError: if *filename* does not end with ``.pdf.txt``.
        OSError: if the input cannot be read or the output cannot be written.
    """
    doc_id = document_id(filename)

    # Pin the encoding on both sides. pdftotext writes UTF-8 unless -enc is
    # given, and ensure_ascii=False emits raw non-ASCII characters, so relying
    # on the locale's default encoding can fail or corrupt the text.
    with open(f'txt/{filename}', 'r', encoding='utf-8') as source:
        plaintext = source.read()

    with open(f'nodes/{doc_id}.jsonld', 'w', encoding='utf-8') as target:
        json.dump(build_node(doc_id, plaintext), target,
                  indent=4, ensure_ascii=False)


def main(argv=None) -> None:
    """Command-line entry point; *argv* defaults to ``sys.argv[1:]``.

    Only the first argument is used. Exits with a message and a non-zero
    status when the argument is missing or lacks the ``.pdf.txt`` suffix.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit(f'usage: txt2rdf.py NAME{TXT_SUFFIX}')
    try:
        convert(args[0])
    except ValueError as error:
        sys.exit(str(error))


if __name__ == '__main__':
    main()
|