From 49269047437be43bacf17396d023746891f99cc0 Mon Sep 17 00:00:00 2001
From: zPlus <zplus@peers.community>
Date: Wed, 25 Dec 2024 09:18:23 +0100
Subject: [PATCH] Add scripts for fetching data from XKCD api.

---
 nodes/xkcd.jsonld       | 10 ++++++
 scripts/xkcd/.gitignore |  2 ++
 scripts/xkcd/README     |  4 +++
 scripts/xkcd/xkcd.py    | 77 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+)
 create mode 100644 nodes/xkcd.jsonld
 create mode 100644 scripts/xkcd/.gitignore
 create mode 100644 scripts/xkcd/README
 create mode 100755 scripts/xkcd/xkcd.py

diff --git a/nodes/xkcd.jsonld b/nodes/xkcd.jsonld
new file mode 100644
index 0000000..960f697
--- /dev/null
+++ b/nodes/xkcd.jsonld
@@ -0,0 +1,10 @@
+{
+    "@context": {
+        "comicstrip": "dokk:vocab:comicstrip:"
+    },
+
+    "@id": "dokk:xkcd",
+    "@type": "comicstrip:ComicStrip",
+    "comicstrip:title": "XKCD",
+    "comicstrip:website": "https://xkcd.com"
+}
diff --git a/scripts/xkcd/.gitignore b/scripts/xkcd/.gitignore
new file mode 100644
index 0000000..20cc517
--- /dev/null
+++ b/scripts/xkcd/.gitignore
@@ -0,0 +1,2 @@
+/images
+/nodes
diff --git a/scripts/xkcd/README b/scripts/xkcd/README
new file mode 100644
index 0000000..8a82de2
--- /dev/null
+++ b/scripts/xkcd/README
@@ -0,0 +1,4 @@
+Download XKCD comics. API is available at https://xkcd.com/json.html
+
+    mkdir nodes images
+    START_NUMBER=1 END_NUMBER=10 ./xkcd.py
diff --git a/scripts/xkcd/xkcd.py b/scripts/xkcd/xkcd.py
new file mode 100755
index 0000000..d8e41cb
--- /dev/null
+++ b/scripts/xkcd/xkcd.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Fetch XKCD comic metadata and images; write one JSON-LD node per comic.
+
+The inclusive comic range comes from START_NUMBER and END_NUMBER. Images go to
+images/ and nodes to nodes/ in the working directory (create both first).
+"""
+
+import json
+import os
+import requests
+import subprocess
+import time
+from datetime import datetime
+from pathlib import Path
+
+# Inclusive range of comic numbers to download. A missing variable raises
+# KeyError and a non-numeric one raises ValueError, even under "python -O".
+START_NUMBER = int(os.environ['START_NUMBER'])
+END_NUMBER = int(os.environ['END_NUMBER'])
+
+for n in range(START_NUMBER, END_NUMBER + 1):
+    # A timeout keeps one stalled connection from hanging the whole run.
+    req = requests.get(url = f"https://xkcd.com/{n}/info.0.json", timeout = 30)
+    if req.status_code != 200:
+        print(f"Could not fetch comic #{n}. Status code {req.status_code}")
+        continue
+
+    data = req.json()
+
+    # Explicit raises instead of asserts, which "python -O" strips.
+    if n != data['num']:
+        raise ValueError(f"Comic #{n}: API returned number {data['num']}")
+
+    image_extension = Path(data['img']).suffix.lower()
+    if image_extension not in [ '.jpg', '.jpeg', '.png' ]:
+        raise ValueError(f"Comic #{n}: unsupported image type {data['img']}")
+
+    node_id = f"xkcd_comic_{n}"
+    blob_filename = f'{node_id}{image_extension}'
+
+    node = {
+        "@context": {
+            "comicstrip": "dokk:vocab:comicstrip:",
+            "comicstrip:license": { "@type": "@id" },
+            "comicstrip:series": { "@type": "@id" },
+            "blob": "dokk:vocab:blob:"
+        },
+        "@id": f"dokk:{node_id}",
+        "@type": "comicstrip:ComicStripPanel",
+        "comicstrip:series": "dokk:xkcd",
+        "comicstrip:title": data['title'],
+        "comicstrip:published": f"{data['year']}-{int(data['month']):02d}-{int(data['day']):02d}",
+        "comicstrip:license": "dokk:license:CC-BY-NC-2.5",
+        "comicstrip:website": f'https://xkcd.com/{n}/',
+        # TODO: add "comicstrip:transcript". Most xkcd comics have a transcript
+        # posted on xkcd; explainxkcd.com also transcribes the text in the images.
+        "comicstrip:xkcd_alt": data['alt'],
+        "comicstrip:number": data['num'],
+        "blob:at": {
+            "@id": f"file:/images/{blob_filename}",
+            "blob:primary_source": data['img'],
+            # Zero-padded ISO date from a single clock read, like "published".
+            "blob:retrieval_date": datetime.now().strftime('%Y-%m-%d')
+        }
+    }
+
+    # Download image; check=True raises CalledProcessError if wget fails.
+    subprocess.run(['wget', '--quiet', '--output-document', f'images/{blob_filename}', data['img']], check=True)
+
+    # Save node to file
+    with open(f'nodes/{node_id}.jsonld', 'w') as file:
+        json.dump(node, file, indent=4)
+
+    print(f'[done] {node_id}')
+
+    # Be nice
+    time.sleep(1)