Commit: 49269047437be43bacf17396d023746891f99cc0

1

+

#!/usr/bin/env python3

2

+

3

+

import json

4

+

import os

5

+

import requests

6

+

import subprocess

7

+

import time

8

+

from datetime import datetime

9

+

from pathlib import Path

10

+

11

+

# Image formats the importer knows how to store.
ALLOWED_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Seconds to wait for xkcd.com before giving up on a request. Without a
# timeout, requests waits forever on a stalled connection.
REQUEST_TIMEOUT = 30


def read_comic_range():
    """Read the inclusive comic range from the environment.

    START_NUMBER is the first comic number to download and END_NUMBER is the
    last one (both inclusive).

    Returns:
        A ``(start, end)`` tuple of ints.

    Raises:
        SystemExit: if either variable is missing or is not an integer.
            Explicit checks are used instead of ``assert`` because asserts
            are stripped when Python runs with ``-O``.
    """
    bounds = []
    for name in ('START_NUMBER', 'END_NUMBER'):
        raw = os.environ.get(name)
        if raw is None:
            raise SystemExit(f'Environment variable {name} is required')
        try:
            bounds.append(int(raw))
        except ValueError:
            raise SystemExit(
                f'Environment variable {name} must be an integer, got {raw!r}'
            ) from None
    return bounds[0], bounds[1]


def build_node(data, retrieval_date=None):
    """Build the JSON-LD node describing one xkcd comic.

    Args:
        data: dict decoded from ``https://xkcd.com/<n>/info.0.json``. The keys
            read here are ``num``, ``img``, ``title``, ``year``, ``month``,
            ``day`` and ``alt``.
        retrieval_date: datetime recorded as the image retrieval date.
            Defaults to the current local time.

    Returns:
        A ``(node_id, blob_filename, node)`` tuple: the node identifier, the
        file name under which the image is stored, and the node dict.

    Raises:
        ValueError: if the image extension is not one of
            ALLOWED_IMAGE_EXTENSIONS.
    """
    n = data['num']

    image_extension = Path(data['img']).suffix.lower()
    if image_extension not in ALLOWED_IMAGE_EXTENSIONS:
        raise ValueError(
            f"Comic #{n}: unsupported image extension {image_extension!r}"
        )

    if retrieval_date is None:
        retrieval_date = datetime.now()

    node_id = f"xkcd_comic_{n}"
    blob_filename = f'{node_id}{image_extension}'

    node = {
        "@context": {
            "comicstrip": "dokk:vocab:comicstrip:",
            "comicstrip:license": { "@type": "@id" },
            "comicstrip:series": { "@type": "@id" },
            "blob": "dokk:vocab:blob:"
        },
        "@id": f"dokk:{node_id}",
        "@type": "comicstrip:ComicStripPanel",
        "comicstrip:series": "dokk:xkcd",
        "comicstrip:title": data['title'],
        "comicstrip:published": f"{data['year']}-{int(data['month']):02d}-{int(data['day']):02d}",
        "comicstrip:license": "dokk:license:CC-BY-NC-2.5",
        "comicstrip:website": f'https://xkcd.com/{n}/',
        # TODO: add "comicstrip:transcript". The xkcd JSON only carries the
        # title, date, image link and title text; explainxkcd.com has
        # transcripts of the text inside the images.
        "comicstrip:xkcd_alt": data['alt'],
        "comicstrip:number": n,
        "blob:at": {
            "@id": f"file:/images/{blob_filename}",
            "blob:primary_source": data['img'],
            # One timestamp, zero-padded, so the value matches the
            # YYYY-MM-DD shape used by "comicstrip:published".
            "blob:retrieval_date": retrieval_date.strftime('%Y-%m-%d')
        }
    }

    return node_id, blob_filename, node


def main():
    """Download every comic in the configured range.

    For each comic number the metadata is fetched from the xkcd JSON API, the
    image is downloaded with wget into ``images/`` and the node is written to
    ``nodes/<node_id>.jsonld``. Comics whose metadata request does not return
    HTTP 200 are reported and skipped.
    """
    start_number, end_number = read_comic_range()

    for n in range(start_number, end_number + 1):
        req = requests.get(url=f"https://xkcd.com/{n}/info.0.json", timeout=REQUEST_TIMEOUT)

        if req.status_code != 200:
            print(f"Could not fetch comic #{n}. Status code {req.status_code}")
            continue

        data = req.json()

        # Sanity check: the API must describe the comic that was asked for.
        if data['num'] != n:
            raise RuntimeError(f"Requested comic #{n} but the API returned #{data['num']}")

        node_id, blob_filename, node = build_node(data)

        # Download image. check=True raises CalledProcessError on a non-zero
        # exit status, so no node is saved for a failed download.
        subprocess.run(
            ['wget', '--quiet', '--output-document', f'images/{blob_filename}', data['img']],
            check=True,
        )

        # Save node to file
        with open(f'nodes/{node_id}.jsonld', 'w') as file:
            json.dump(node, file, indent=4)

        print(f'[done] {node_id}')

        # Be nice
        time.sleep(1)


if __name__ == '__main__':
    main()

Author	zPlus <zplus@peers.community> 2024-12-25 08:18:23
Committer	zPlus <zplus@peers.community> 2024-12-25 08:18:23
Commit	4926904 (patch)
Tree	46ebcf8
Parent(s)	75ab558 (diff)

-rw-r--r--	nodes/xkcd.jsonld	10
-rw-r--r--	scripts/xkcd/.gitignore	2
-rw-r--r--	scripts/xkcd/README	4
-rwxr-xr-x	scripts/xkcd/xkcd.py	77

View	Unified Unified (raw) Side by side
Side	Normal Reverse
Whitespace	Include Ignore all Ignore amount changes Ignore at end of line
Context lines
Inter-hunk lines

		@@ -0,0 +1,10 @@
1	+	{
2	+	"@context": {
3	+	"comicstrip": "dokk:vocab:comicstrip:"
4	+	},
5	+
6	+	"@id": "dokk:xkcd",
7	+	"@type": "comicstrip:ComicStrip",
8	+	"comicstrip:title": "XKCD",
9	+	"comicstrip:website": "https://xkcd.com"
10	+	}

		@@ -0,0 +1,4 @@
1	+	Download XKCD comics. API is available at https://xkcd.com/json.html
2	+
3	+	mkdir nodes images
4	+	START_NUMBER=1 END_NUMBER=10 ./xkcd.py

index 0000000..960f697
old size: 0B - new size: 216B
new file mode: -rw-r--r--

index 0000000..20cc517
old size: 0B - new size: 15B
new file mode: -rw-r--r--

index 0000000..8a82de2
old size: 0B - new size: 136B
new file mode: -rw-r--r--

index 0000000..d8e41cb
old size: 0B - new size: 2K
new file mode: -rwxr-xr-x

		@@ -0,0 +1,77 @@
1	+	#!/usr/bin/env python3
2	+
3	+	import json
4	+	import os
5	+	import requests
6	+	import subprocess
7	+	import time
8	+	from datetime import datetime
9	+	from pathlib import Path
10	+
# Image formats the importer knows how to store.
ALLOWED_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Seconds to wait for xkcd.com before giving up on a request. Without a
# timeout, requests waits forever on a stalled connection.
REQUEST_TIMEOUT = 30


def read_comic_range():
    """Read the inclusive comic range from the environment.

    START_NUMBER is the first comic number to download and END_NUMBER is the
    last one (both inclusive).

    Returns:
        A ``(start, end)`` tuple of ints.

    Raises:
        SystemExit: if either variable is missing or is not an integer.
            Explicit checks are used instead of ``assert`` because asserts
            are stripped when Python runs with ``-O``.
    """
    bounds = []
    for name in ('START_NUMBER', 'END_NUMBER'):
        raw = os.environ.get(name)
        if raw is None:
            raise SystemExit(f'Environment variable {name} is required')
        try:
            bounds.append(int(raw))
        except ValueError:
            raise SystemExit(
                f'Environment variable {name} must be an integer, got {raw!r}'
            ) from None
    return bounds[0], bounds[1]


def build_node(data, retrieval_date=None):
    """Build the JSON-LD node describing one xkcd comic.

    Args:
        data: dict decoded from ``https://xkcd.com/<n>/info.0.json``. The keys
            read here are ``num``, ``img``, ``title``, ``year``, ``month``,
            ``day`` and ``alt``.
        retrieval_date: datetime recorded as the image retrieval date.
            Defaults to the current local time.

    Returns:
        A ``(node_id, blob_filename, node)`` tuple: the node identifier, the
        file name under which the image is stored, and the node dict.

    Raises:
        ValueError: if the image extension is not one of
            ALLOWED_IMAGE_EXTENSIONS.
    """
    n = data['num']

    image_extension = Path(data['img']).suffix.lower()
    if image_extension not in ALLOWED_IMAGE_EXTENSIONS:
        raise ValueError(
            f"Comic #{n}: unsupported image extension {image_extension!r}"
        )

    if retrieval_date is None:
        retrieval_date = datetime.now()

    node_id = f"xkcd_comic_{n}"
    blob_filename = f'{node_id}{image_extension}'

    node = {
        "@context": {
            "comicstrip": "dokk:vocab:comicstrip:",
            "comicstrip:license": { "@type": "@id" },
            "comicstrip:series": { "@type": "@id" },
            "blob": "dokk:vocab:blob:"
        },
        "@id": f"dokk:{node_id}",
        "@type": "comicstrip:ComicStripPanel",
        "comicstrip:series": "dokk:xkcd",
        "comicstrip:title": data['title'],
        "comicstrip:published": f"{data['year']}-{int(data['month']):02d}-{int(data['day']):02d}",
        "comicstrip:license": "dokk:license:CC-BY-NC-2.5",
        "comicstrip:website": f'https://xkcd.com/{n}/',
        # TODO: add "comicstrip:transcript". The xkcd JSON only carries the
        # title, date, image link and title text; explainxkcd.com has
        # transcripts of the text inside the images.
        "comicstrip:xkcd_alt": data['alt'],
        "comicstrip:number": n,
        "blob:at": {
            "@id": f"file:/images/{blob_filename}",
            "blob:primary_source": data['img'],
            # One timestamp, zero-padded, so the value matches the
            # YYYY-MM-DD shape used by "comicstrip:published".
            "blob:retrieval_date": retrieval_date.strftime('%Y-%m-%d')
        }
    }

    return node_id, blob_filename, node


def main():
    """Download every comic in the configured range.

    For each comic number the metadata is fetched from the xkcd JSON API, the
    image is downloaded with wget into ``images/`` and the node is written to
    ``nodes/<node_id>.jsonld``. Comics whose metadata request does not return
    HTTP 200 are reported and skipped.
    """
    start_number, end_number = read_comic_range()

    for n in range(start_number, end_number + 1):
        req = requests.get(url=f"https://xkcd.com/{n}/info.0.json", timeout=REQUEST_TIMEOUT)

        if req.status_code != 200:
            print(f"Could not fetch comic #{n}. Status code {req.status_code}")
            continue

        data = req.json()

        # Sanity check: the API must describe the comic that was asked for.
        if data['num'] != n:
            raise RuntimeError(f"Requested comic #{n} but the API returned #{data['num']}")

        node_id, blob_filename, node = build_node(data)

        # Download image. check=True raises CalledProcessError on a non-zero
        # exit status, so no node is saved for a failed download.
        subprocess.run(
            ['wget', '--quiet', '--output-document', f'images/{blob_filename}', data['img']],
            check=True,
        )

        # Save node to file
        with open(f'nodes/{node_id}.jsonld', 'w') as file:
            json.dump(node, file, indent=4)

        print(f'[done] {node_id}')

        # Be nice
        time.sleep(1)


if __name__ == '__main__':
    main()

		@@ -0,0 +1,2 @@
1	+	/images
2	+	/nodes