#!/usr/bin/env python3
import json
import os
import requests
import subprocess
import time
from datetime import datetime
from pathlib import Path
# Comics that are interactive JavaScript pages rather than static images:
#   1608 "Hoverboard" (https://xkcd.com/1608/) is a small video game.
#   1663 is also a comic that relies on JavaScript.
# TODO how should we add these comics with JavaScript?
SKIPPED_COMICS = frozenset({1608, 1663})

# Image suffixes accepted for a comic blob (compared after lower-casing).
ALLOWED_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')

# Seconds to wait for xkcd.com before giving up on a metadata request.
# Without a timeout a stalled connection would hang the script forever.
REQUEST_TIMEOUT = 30


def env_int(name):
    """Return the required environment variable *name* parsed as an int.

    Exits with an explanatory message when the variable is unset. A value
    that is not a valid integer raises ``ValueError`` from ``int()``.
    """
    try:
        raw = os.environ[name]
    except KeyError:
        raise SystemExit(f"Environment variable {name} must be set") from None
    return int(raw)


def comic_node_id(number):
    """Return the node identifier (without the ``dokk:`` prefix) for a comic."""
    return f"xkcd_comic_{number}"


def comic_blob_filename(data):
    """Return the local image filename for the comic described by *data*.

    *data* is the decoded ``info.0.json`` document of a comic; its ``img``
    URL supplies the file extension and ``num`` the comic number.

    Raises ``ValueError`` when the image is not one of
    ``ALLOWED_IMAGE_EXTENSIONS``. An explicit raise is used instead of
    ``assert`` so the check still runs under ``python -O``.
    """
    image_extension = Path(data['img']).suffix.lower()
    if image_extension not in ALLOWED_IMAGE_EXTENSIONS:
        raise ValueError(
            f"Comic #{data['num']} has unsupported image type: {data['img']}"
        )
    return f"{comic_node_id(data['num'])}{image_extension}"


def build_node(data, retrieved=None):
    """Build the JSON-LD node for one comic.

    *data* is the decoded ``info.0.json`` document of the comic.
    *retrieved* is the ``datetime`` recorded as the blob retrieval date; it
    defaults to the current local time.

    Returns a JSON-serialisable ``dict``.
    """
    if retrieved is None:
        # Sample the clock once so year/month/day cannot straddle midnight.
        retrieved = datetime.now()

    number = data['num']
    blob_filename = comic_blob_filename(data)

    return {
        "@context": {
            "comicstrip": "dokk:vocab:comicstrip:",
            "comicstrip:license": {"@type": "@id"},
            "comicstrip:series": {"@type": "@id"},
            "blob": "dokk:vocab:blob:",
        },
        "@id": f"dokk:{comic_node_id(number)}",
        "@type": "comicstrip:ComicStripPanel",
        "comicstrip:series": "dokk:xkcd",
        "comicstrip:title": data['title'],
        # The API returns month/day unpadded; pad them to get an ISO 8601 date.
        "comicstrip:published": f"{data['year']}-{int(data['month']):02d}-{int(data['day']):02d}",
        "comicstrip:license": "dokk:license:CC-BY-NC-2.5",
        "comicstrip:website": f'https://xkcd.com/{number}/',
        # TODO
        # The xkcd info JSON covers the comic's title, date, image link and
        # title text. Check out explainxkcd.com for a transcript of the text
        # in the images.
        # "comicstrip:transcript":
        "comicstrip:xkcd_alt": data['alt'],
        "comicstrip:number": number,
        "blob:at": {
            "@id": f"file:/images/{blob_filename}",
            "blob:primary_source": data['img'],
            # Zero-padded ISO 8601 date, consistent with "comicstrip:published".
            "blob:retrieval_date": retrieved.date().isoformat(),
        },
    }


def main():
    """Download comics START_NUMBER..END_NUMBER (inclusive) and write nodes.

    For every comic in the range (except ``SKIPPED_COMICS``) the image is
    saved to ``images/`` with ``wget`` and the JSON-LD node to ``nodes/``.
    Comics whose metadata request does not return HTTP 200 are reported and
    skipped; a failed image download aborts the run.
    """
    # Start downloading from this comic number
    start_number = env_int('START_NUMBER')
    # Stop downloading at this comic number
    end_number = env_int('END_NUMBER')

    # Make sure the output directories exist before wget/open write into them.
    images_dir = Path('images')
    nodes_dir = Path('nodes')
    images_dir.mkdir(parents=True, exist_ok=True)
    nodes_dir.mkdir(parents=True, exist_ok=True)

    for n in range(start_number, end_number + 1):
        if n in SKIPPED_COMICS:
            continue

        req = requests.get(
            url=f"https://xkcd.com/{n}/info.0.json",
            timeout=REQUEST_TIMEOUT,
        )
        if req.status_code != 200:
            print(f"Could not fetch comic #{n}. Status code {req.status_code}")
            continue

        data = req.json()
        if data['num'] != n:
            raise ValueError(
                f"Requested comic #{n} but the response describes #{data['num']}"
            )

        node_id = comic_node_id(n)
        blob_filename = comic_blob_filename(data)
        node = build_node(data)

        # Download image; check=True raises CalledProcessError if wget fails,
        # so no node is written for an image that could not be fetched.
        subprocess.run(
            ['wget', '--quiet', '--output-document',
             str(images_dir / blob_filename), data['img']],
            check=True,
        )

        # Save node to file
        with open(nodes_dir / f'{node_id}.jsonld', 'w', encoding='utf-8') as file:
            json.dump(node, file, indent=4)

        print(f'[done] {node_id}')

        # Be nice
        time.sleep(1)


if __name__ == '__main__':
    main()