home » zplus/dokk.git
Author zPlus <zplus@peers.community> 2024-12-25 08:18:23
Committer zPlus <zplus@peers.community> 2024-12-25 08:18:23
Commit 4926904 (patch)
Tree 46ebcf8
Parent(s)

Add scripts for fetching data from XKCD api.


commits diff: 75ab558..4926904
4 files changed, 93 insertions, 0 deletions (download)


Diffstat
-rw-r--r-- nodes/xkcd.jsonld 10
-rw-r--r-- scripts/xkcd/.gitignore 2
-rw-r--r-- scripts/xkcd/README 4
-rwxr-xr-x scripts/xkcd/xkcd.py 77

Diff options
View
Side
Whitespace
Context lines
Inter-hunk lines
+10/-0 A   nodes/xkcd.jsonld
index 0000000..960f697
old size: 0B - new size: 216B
new file mode: -rw-r--r--
@@ -0,0 +1,10 @@
1 + {
2 + "@context": {
3 + "comicstrip": "dokk:vocab:comicstrip:"
4 + },
5 +
6 + "@id": "dokk:xkcd",
7 + "@type": "comicstrip:ComicStrip",
8 + "comicstrip:title": "XKCD",
9 + "comicstrip:website": "https://xkcd.com"
10 + }

+2/-0 A   scripts/xkcd/.gitignore
index 0000000..20cc517
old size: 0B - new size: 15B
new file mode: -rw-r--r--
@@ -0,0 +1,2 @@
1 + /images
2 + /nodes

+4/-0 A   scripts/xkcd/README
index 0000000..8a82de2
old size: 0B - new size: 136B
new file mode: -rw-r--r--
@@ -0,0 +1,4 @@
1 + Download XKCD comics. API is available at https://xkcd.com/json.html
2 +
3 + mkdir nodes images
4 + START_NUMBER=1 END_NUMBER=10 ./xkcd.py

+77/-0 A   scripts/xkcd/xkcd.py
index 0000000..d8e41cb
old size: 0B - new size: 2K
new file mode: -rwxr-xr-x
@@ -0,0 +1,77 @@
1 + #!/usr/bin/env python3
2 +
3 + import json
4 + import os
5 + import requests
6 + import subprocess
7 + import time
8 + from datetime import datetime
9 + from pathlib import Path
10 +
# Image formats this script knows how to store.
ALLOWED_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Seconds to wait for xkcd.com before giving up on a request, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def comic_range(environ):
    """Return the (start, end) comic numbers configured in *environ*.

    START_NUMBER is the first comic to download and END_NUMBER the last one
    (inclusive). Raises SystemExit with a readable message when either variable
    is missing or is not an integer. Explicit checks are used instead of
    `assert` because assertions are stripped when Python runs with -O.
    """
    try:
        return int(environ['START_NUMBER']), int(environ['END_NUMBER'])
    except KeyError as err:
        raise SystemExit(f"Missing environment variable {err.args[0]}") from None
    except ValueError as err:
        raise SystemExit(f"START_NUMBER and END_NUMBER must be integers: {err}") from None


def build_node(data, retrieved):
    """Build the JSON-LD node describing one comic.

    *data* is the decoded JSON returned by the XKCD API for a single comic and
    *retrieved* is the date (or datetime) at which it was fetched.

    Returns a tuple (node_id, blob_filename, node). Raises ValueError when the
    image has an extension that is not in ALLOWED_IMAGE_EXTENSIONS.
    """
    n = data['num']

    image_extension = Path(data['img']).suffix.lower()
    if image_extension not in ALLOWED_IMAGE_EXTENSIONS:
        raise ValueError(f"Comic #{n}: unsupported image extension {image_extension!r}")

    node_id = f"xkcd_comic_{n}"
    blob_filename = f'{node_id}{image_extension}'

    node = {
        "@context": {
            "comicstrip": "dokk:vocab:comicstrip:",
            "comicstrip:license": { "@type": "@id" },
            "comicstrip:series": { "@type": "@id" },
            "blob": "dokk:vocab:blob:"
        },
        "@id": f"dokk:{node_id}",
        "@type": "comicstrip:ComicStripPanel",
        "comicstrip:series": "dokk:xkcd",
        "comicstrip:title": data['title'],
        "comicstrip:published": f"{data['year']}-{int(data['month']):02d}-{int(data['day']):02d}",
        "comicstrip:license": "dokk:license:CC-BY-NC-2.5",
        "comicstrip:website": f'https://xkcd.com/{n}/',
        # TODO
        # The API only covers details like the comic's title, the date, the
        # link to the image and the title text; it does not provide a
        # transcript. Check out explainxkcd.com for a transcript of the text
        # in the images.
        # "comicstrip:transcript":
        "comicstrip:xkcd_alt": data['alt'],
        "comicstrip:number": data['num'],
        "blob:at": {
            "@id": f"file:/images/{blob_filename}",
            "blob:primary_source": data['img'],
            # Single timestamp, zero-padded, so the value is a valid ISO date
            # in the same format as "comicstrip:published".
            "blob:retrieval_date": retrieved.strftime('%Y-%m-%d')
        }
    }

    return node_id, blob_filename, node


def main():
    """Download every comic in the configured range into images/ and nodes/."""
    start_number, end_number = comic_range(os.environ)

    for n in range(start_number, end_number + 1):
        req = requests.get(url = f"https://xkcd.com/{n}/info.0.json", timeout = REQUEST_TIMEOUT)

        if req.status_code != 200:
            print(f"Could not fetch comic #{n}. Status code {req.status_code}")
            continue

        data = req.json()

        # The API must return the comic that was asked for.
        if data['num'] != n:
            raise ValueError(f"Requested comic #{n} but the API returned #{data['num']}")

        node_id, blob_filename, node = build_node(data, datetime.now())

        # Download image. check=True raises CalledProcessError if wget fails,
        # so no node is written for an image that could not be retrieved.
        subprocess.run(
            ['wget', '--quiet', '--output-document', f'images/{blob_filename}', data['img']],
            check = True)

        # Save node to file
        with open(f'nodes/{node_id}.jsonld', 'w', encoding = 'utf-8') as file:
            json.dump(node, file, indent=4)

        print(f'[done] {node_id}')

        # Be nice
        time.sleep(1)


if __name__ == '__main__':
    main()