#!/usr/bin/env python3

import json
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

import requests


# Image formats this script accepts; any other extension aborts the run.
SUPPORTED_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Seconds to wait for the xkcd JSON endpoint before giving up, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def read_comic_number(name):
    """Return the integer stored in environment variable *name*.

    START_NUMBER is the first comic to download and END_NUMBER the last
    (inclusive). The script exits with an explanatory message when the
    variable is missing or is not an integer.
    """
    raw = os.environ.get(name)
    if raw is None:
        sys.exit(f"Environment variable {name} must be set")
    try:
        return int(raw)
    except ValueError:
        sys.exit(f"Environment variable {name} must be an integer, got {raw!r}")


def build_node(n, data, blob_filename, retrieved=None):
    """Build the JSON-LD node describing xkcd comic number *n*.

    Args:
        n: Comic number; used for the node id and the website URL.
        data: Decoded ``info.0.json`` payload for the comic. The keys read
            are ``title``, ``year``, ``month``, ``day``, ``alt``, ``num``
            and ``img``.
        blob_filename: File name of the downloaded image inside ``images/``.
        retrieved: ``datetime`` (or ``date``) recorded as the retrieval
            date. Defaults to the current local time.

    Returns:
        A dict ready to be serialised with ``json.dump``.
    """
    if retrieved is None:
        retrieved = datetime.now()

    return {
        "@context": {
            "comicstrip": "dokk:vocab:comicstrip:",
            "comicstrip:license": {"@type": "@id"},
            "comicstrip:series": {"@type": "@id"},
            "blob": "dokk:vocab:blob:",
        },
        "@id": f"dokk:xkcd_comic_{n}",
        "@type": "comicstrip:ComicStripPanel",
        "comicstrip:series": "dokk:xkcd",
        "comicstrip:title": data['title'],
        "comicstrip:published": f"{data['year']}-{int(data['month']):02d}-{int(data['day']):02d}",
        "comicstrip:license": "dokk:license:CC-BY-NC-2.5",
        "comicstrip:website": f'https://xkcd.com/{n}/',
        # TODO
        # Most xkcd comics have a transcript posted on xkcd. This info covers
        # details like the comic's title, the date, the link to the image and
        # the title text. Check out explainxkcd.com for a transcript of the
        # text in the images.
        # "comicstrip:transcript":
        "comicstrip:xkcd_alt": data['alt'],
        "comicstrip:number": data['num'],
        "blob:at": {
            "@id": f"file:/images/{blob_filename}",
            "blob:primary_source": data['img'],
            # Read the clock once and zero-pad, so the value is a consistent
            # YYYY-MM-DD date in the same shape as "comicstrip:published".
            "blob:retrieval_date": f"{retrieved:%Y-%m-%d}",
        },
    }


def main():
    """Download every comic from START_NUMBER to END_NUMBER (inclusive).

    For each comic the image is saved to ``images/`` and its JSON-LD node to
    ``nodes/``. A comic whose metadata request does not return HTTP 200 is
    reported and skipped. Any other problem (mismatched comic number,
    unsupported image type, failed download) aborts the run with an
    exception.
    """
    start_number = read_comic_number('START_NUMBER')
    end_number = read_comic_number('END_NUMBER')

    # wget and open() do not create missing parent directories.
    os.makedirs('images', exist_ok=True)
    os.makedirs('nodes', exist_ok=True)

    for n in range(start_number, end_number + 1):
        req = requests.get(url=f"https://xkcd.com/{n}/info.0.json", timeout=REQUEST_TIMEOUT)

        if req.status_code != 200:
            print(f"Could not fetch comic #{n}. Status code {req.status_code}")
            continue

        data = req.json()

        # Explicit checks instead of assert: asserts vanish under `python -O`.
        if data['num'] != n:
            raise RuntimeError(f"Requested comic #{n} but received #{data['num']}")

        image_extension = Path(data['img']).suffix.lower()
        if image_extension not in SUPPORTED_IMAGE_EXTENSIONS:
            raise RuntimeError(
                f"Comic #{n} has unsupported image type {image_extension!r}: {data['img']}"
            )

        node_id = f"xkcd_comic_{n}"
        blob_filename = f'{node_id}{image_extension}'
        node = build_node(n, data, blob_filename)

        # Download image; check=True raises CalledProcessError if wget fails.
        subprocess.run(
            ['wget', '--quiet', '--output-document', f'images/{blob_filename}', data['img']],
            check=True,
        )

        # Save node to file
        with open(f'nodes/{node_id}.jsonld', 'w', encoding='utf-8') as file:
            json.dump(node, file, indent=4)

        print(f'[done] {node_id}')

        # Be nice
        time.sleep(1)


if __name__ == "__main__":
    main()