95 changes: 25 additions & 70 deletions utilities/kym_scrape.py
@@ -4,7 +4,7 @@
Know Your Meme Scraper
Grabs all titles from https://knowyourmeme.com

Puts output into memes.txt
Sends output to stdout.

Used to feed into password cracking wordlists like
https://github.com/initstring/passphrase-wordlist
@@ -13,95 +13,54 @@
"""

import html
import time
import itertools
import re
import sys
import requests
import time
import urllib.request

# The "all" URL allows autoscrolling
KYM_URL = 'https://knowyourmeme.com/memes/all/page'

# Regex to grab all formatted titles
RE_TITLE = re.compile(r'<h3[^>]*>\s*(.*?)\s*</h3>')

# Text to know when we reached end of line
NO_MORE = 'There are no entries for this category'

# Need real headers to get past WAF
HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Out file
OUTFILE = 'memes.txt'

# File for in-process scraping
LOGFILE = 'memes-incomplete.txt'

# Sleep to avoid IP ban
SLEEP = 3

def write_log(phrases):
"""
Logs phrases as the program runs

Used for troubleshooting or to at least have _something_ in the case of
IP ban, failure, etc
"""
with open(LOGFILE, 'a') as logfile:
for phrase in phrases:
phrase = html.unescape(phrase)
logfile.write(phrase + '\n')

def write_final(phrases):
"""
Writes all phrases to a log file
"""
# Unescape the HTML and write the phrases out
with open(OUTFILE, 'w') as outfile:
for phrase in phrases:
phrase = html.unescape(phrase)
outfile.write(phrase + '\n')

def scrape_pages():
"""
Loops through all pages of kym
"""
page = 0
phrases = set([])
phrases = set()

while True:
for page in itertools.count():
# Build the URL based on auto-scroll behaviour
url = "{}/{}".format(KYM_URL, page)
response = requests.get(url, headers=HEADERS)

# Check for IP ban
if response.status_code == 403:
print("\n[!] You have been IP banned. Oops.")
sys.exit()

# Return if no more results
if response.status_code == 404:
print("\n[*] Reached end of line at page {}. Exiting"
.format(page))
return phrases

# Clear stdout for ongoing notifications
sys.stdout.flush()
sys.stdout.write(" " * 20)
sys.stdout.write("\r")
req = urllib.request.Request(f"{KYM_URL}/{page}", headers=HEADERS)
try:
response = urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
# Check for IP ban
if e.code == 403:
sys.exit("\n[!] You have been IP banned. Oops.")

# Return if no more results
if e.code == 404:
print(f"\n[*] Reached end of line at page {page}. Exiting", file=sys.stderr)
return phrases

# Grab phrases from the raw text and add to set
new_phrases = re.findall(RE_TITLE, response.text)
new_phrases = set(RE_TITLE.findall(response.read().decode()))
for new_phrase in new_phrases - phrases:
print(html.unescape(new_phrase))
phrases.update(new_phrases)

# Write the new phrases to an ongoing logfile
write_log(new_phrases)

# Update the patiently waiting user
sys.stdout.write("[*] Page: {}, Phrases: {}, Unique Phrases: {}"
.format(page, len(new_phrases), len(phrases)))

# Increment the page for the next loop
page += 1
sys.stderr.write(f"[*] Page: {page}, Phrases: {len(new_phrases)}, Unique Phrases: {len(phrases)}\r")
sys.stderr.flush()

# Sleep to avoid IP ban
time.sleep(SLEEP)
@@ -111,12 +70,8 @@ def main():
"""
Main program function
"""
print("[*] Scraping all pages of KYM...")
phrases = scrape_pages()

print("[+] Found {} phrases, writing to {}..."
.format(len(phrases), OUTFILE))
write_final(phrases)
print("[*] Scraping all pages of KYM...", file=sys.stderr)
scrape_pages()


if __name__ == "__main__":
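With this change the wordlist is written to stdout while progress and status messages go to stderr, so a run can be captured with a plain redirect. A minimal usage sketch, assuming the script is invoked from the repository root (the dated filename is only illustrative):

```
# Titles arrive on stdout; progress stays visible on the terminal via stderr
python3 utilities/kym_scrape.py > know-your-meme-$(date +%Y-%m-%d).txt
```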
37 changes: 17 additions & 20 deletions utilities/updating-sources.md
@@ -7,30 +7,28 @@ Some of the source files get regular updates. Below is a guide to obtaining thos
## IMDB titles

```
wget https://datasets.imdbws.com/title.basics.tsv.gz
gunzip ./title.basics.tsv.gz
cat title.basics.tsv | awk -F '\t' '{print $3}' > ./imdb-titles-$(date +%Y-%m-%d).txt
rm title.basics.tsv
curl https://datasets.imdbws.com/title.basics.tsv.gz \
| gunzip \
| awk -F '\t' '{print $3}' \
> ./imdb-titles-$(date +%Y-%m-%d).txt
```

## Wikipedia article titles & category names

```
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2
gunzip2 ./enwiki-latest-pages-articles-multistream-index.txt.bz2
cat ./enwiki-latest-pages-articles-multistream-index.txt | cut -d: -f 3 > ./wikipedia-$(date +%Y-%m-%d).txt
rm enwiki-latest-pages-articles-multistream-index.txt

curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2 \
| bzip2 -d \
| cut -d: -f 3 \
> ./wikipedia-$(date +%Y-%m-%d).txt
```

## Wiktionary titles

```
wget https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz
gunzip enwiktionary-latest-all-titles.gz
cat enwiktionary-latest-all-titles | awk -F '\t' '{print $2}' > ./wiktionary-$(date +%Y-%m-%d).txt
rm enwiktionary-latest-all-titles

curl https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz \
| gunzip \
| awk -F '\t' '{print $2}' \
> ./wiktionary-$(date +%Y-%m-%d).txt
```

## Urban Dictionary
@@ -45,18 +43,17 @@ python3 ./main.py --out urban-dictionary-$(date +%Y-%m-%d).txt
## Know Your Meme

```
python3 /utilities/kym_scrape.py
mv memes.txt ./know-your-meme-$(date +%Y-%m-%d).txt
python3 utilities/kym_scrape.py > know-your-meme-$(date +%Y-%m-%d).txt
```

## Global POI dataset

```
wget http://download.geonames.org/export/dump/allCountries.zip
unzip ./allCountries.zip
cat allCountries.txt | awk -F '\t' '{print $3}' > ./global-poi-$(date +%Y-%m-%d).txt
unzip -p ./allCountries.zip allCountries.txt \
| awk -F '\t' '{print $3}' \
> ./global-poi-$(date +%Y-%m-%d).txt
rm allCountries.zip
rm allCountries.txt
```

## Billboard charts
@@ -84,4 +81,4 @@ If you generate a new version and want to compare what's new you can use a comma

```
sort new.txt old.txt | uniq -u
```
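Note that `sort new.txt old.txt | uniq -u` prints the lines unique to either file, so removed entries show up alongside new ones. If only the newly added lines are of interest, a `comm`-based variant (assuming bash, for process substitution) is one option:

```
# Lines present in new.txt but absent from old.txt
comm -13 <(sort old.txt) <(sort new.txt)
```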