From 49269047437be43bacf17396d023746891f99cc0 Mon Sep 17 00:00:00 2001 From: zPlus Date: Wed, 25 Dec 2024 09:18:23 +0100 Subject: [PATCH] Add scripts for fetching data from XKCD api. --- nodes/xkcd.jsonld | 10 ++++++ scripts/xkcd/.gitignore | 2 ++ scripts/xkcd/README | 4 +++ scripts/xkcd/xkcd.py | 77 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+) create mode 100644 nodes/xkcd.jsonld create mode 100644 scripts/xkcd/.gitignore create mode 100644 scripts/xkcd/README create mode 100755 scripts/xkcd/xkcd.py diff --git a/nodes/xkcd.jsonld b/nodes/xkcd.jsonld new file mode 100644 index 0000000..960f697 --- /dev/null +++ b/nodes/xkcd.jsonld @@ -0,0 +1,10 @@ +{ + "@context": { + "comicstrip": "dokk:vocab:comicstrip:" + }, + + "@id": "dokk:xkcd", + "@type": "comicstrip:ComicStrip", + "comicstrip:title": "XKCD", + "comicstrip:website": "https://xkcd.com" +} diff --git a/scripts/xkcd/.gitignore b/scripts/xkcd/.gitignore new file mode 100644 index 0000000..20cc517 --- /dev/null +++ b/scripts/xkcd/.gitignore @@ -0,0 +1,2 @@ +/images +/nodes diff --git a/scripts/xkcd/README b/scripts/xkcd/README new file mode 100644 index 0000000..8a82de2 --- /dev/null +++ b/scripts/xkcd/README @@ -0,0 +1,4 @@ +Download XKCD comics. 
API is available at https://xkcd.com/json.html
+
+    mkdir nodes images
+    START_NUMBER=1 END_NUMBER=10 ./xkcd.py
diff --git a/scripts/xkcd/xkcd.py b/scripts/xkcd/xkcd.py
new file mode 100755
index 0000000..d8e41cb
--- /dev/null
+++ b/scripts/xkcd/xkcd.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+"""Download XKCD comics and save each one as an image plus a JSON-LD node.
+
+START_NUMBER and END_NUMBER (both inclusive) are read from the environment.
+The images/ and nodes/ directories must already exist.
+"""
+
+import json
+import os
+import requests
+import subprocess
+import time
+from datetime import datetime
+from pathlib import Path
+
+# Image formats this script knows how to store
+IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')
+
+# Seconds to wait for xkcd.com, so a stalled connection cannot hang the run
+REQUEST_TIMEOUT = 30
+
+
+def read_comic_number(name):
+    """Return environment variable `name` as an int, or exit with a message."""
+    # Explicit checks rather than assert, which "python -O" strips out
+    try:
+        return int(os.environ[name])
+    except KeyError:
+        raise SystemExit(f"Environment variable {name} is required") from None
+    except ValueError:
+        raise SystemExit(f"Environment variable {name} must be an integer") from None
+
+
+def build_node(n, data, node_id, blob_filename):
+    """Return the JSON-LD node for comic `n` from its API response `data`."""
+    return {
+        "@context": {
+            "comicstrip": "dokk:vocab:comicstrip:",
+            "comicstrip:license": { "@type": "@id" },
+            "comicstrip:series": { "@type": "@id" },
+            "blob": "dokk:vocab:blob:"
+        },
+        "@id": f"dokk:{node_id}",
+        "@type": "comicstrip:ComicStripPanel",
+        "comicstrip:series": "dokk:xkcd",
+        "comicstrip:title": data['title'],
+        "comicstrip:published": f"{data['year']}-{int(data['month']):02d}-{int(data['day']):02d}",
+        "comicstrip:license": "dokk:license:CC-BY-NC-2.5",
+        "comicstrip:website": f'https://xkcd.com/{n}/',
+        # TODO
+        # Most xkcd comics have a transcript posted on xkcd. This info covers details
+        # like the comic's title, the date, the link to the image and the title text.
+        # Check out explainxkcd.com for a transcript of the text in the images.
+        # "comicstrip:transcript":
+        "comicstrip:xkcd_alt": data['alt'],
+        "comicstrip:number": data['num'],
+        "blob:at": {
+            "@id": f"file:/images/{blob_filename}",
+            "blob:primary_source": data['img'],
+            # Zero-padded ISO date, same format as comicstrip:published
+            "blob:retrieval_date": datetime.now().strftime('%Y-%m-%d')
+        }
+    }
+
+
+def main():
+    """Download every comic from START_NUMBER to END_NUMBER inclusive."""
+    start_number = read_comic_number('START_NUMBER')
+    end_number = read_comic_number('END_NUMBER')
+
+    for n in range(start_number, end_number + 1):
+        req = requests.get(url = f"https://xkcd.com/{n}/info.0.json", timeout = REQUEST_TIMEOUT)
+
+        if req.status_code != 200:
+            print(f"Could not fetch comic #{n}. Status code {req.status_code}")
+            continue
+
+        data = req.json()
+
+        if data['num'] != n:
+            raise RuntimeError(f"Asked for comic #{n} but the API returned #{data['num']}")
+
+        image_extension = Path(data['img']).suffix.lower()
+        if image_extension not in IMAGE_EXTENSIONS:
+            raise RuntimeError(f"Comic #{n} has an unsupported image type: {data['img']}")
+
+        node_id = f"xkcd_comic_{n}"
+        blob_filename = f'{node_id}{image_extension}'
+        node = build_node(n, data, node_id, blob_filename)
+
+        # Download image; check=True raises CalledProcessError if wget fails
+        subprocess.run(['wget', '--quiet', '--output-document', f'images/{blob_filename}', data['img']], check=True)
+
+        # Save node to file
+        with open(f'nodes/{node_id}.jsonld', 'w', encoding='utf-8') as file:
+            json.dump(node, file, indent=4)
+
+        print(f'[done] {node_id}')
+
+        # Be nice
+        time.sleep(1)
+
+
+if __name__ == '__main__':
+    main()