Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
github_repos.db
github_repos.db-shm
github_repos.db-wal
__pycache__/
venv/
286 changes: 124 additions & 162 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,14 @@
import subprocess
import time
from datetime import datetime, timedelta, timezone
from typing import Callable, List
from typing import List

import requests

from db import RepoDB

# GitHub API token. Prefer the GITHUB_TOKEN environment variable so the
# secret is never committed to the repository; the original placeholder
# remains the fallback, so behavior is unchanged when the variable is unset.
TOKEN = os.environ.get("GITHUB_TOKEN", "replace this")
# Only repositories with at least this many stars are crawled.
MIN_STARS = 1_000
# Database file the crawler writes into.
DB_PATH = "github_repos.db"
# Database file the live site serves from (swapped in at deploy time).
LIVE_DB_PATH = "repos.db"
# Name of the PM2 process that serves the site.
PM2_APP_NAME = "git_leaderboard"

GRAPHQL_QUERY = """
query($queryString: String!, $cursor: String) {
rateLimit {
Expand Down Expand Up @@ -126,163 +123,128 @@ def execute_query(self, query: str, cursor: str = None):

raise Exception("Max retries exceeded.")


def deploy_site(crawled_db_path: str):
    """Publish the freshly crawled database to the live site.

    Reads summary statistics from the crawled DB, stops the PM2 app,
    swaps the live database file, patches the repo count into
    index.html and the language/topic lists into static/app.js, then
    restarts PM2.
    """
    log("Preparing deployment...")

    stats_db = RepoDB(crawled_db_path)

    # Headline number for the page: total repos currently tracked.
    count_row = stats_db.conn.execute("SELECT COUNT(*) AS cnt FROM repo_latest").fetchone()
    total_repos = int(count_row["cnt"]) if count_row else 0
    formatted_total = f"{total_repos:,}"

    # Alphabetical language list for the filter dropdown.
    languages = [
        str(r["name"])
        for r in stats_db.conn.execute(
            "SELECT name FROM language ORDER BY name LIMIT 5000"
        ).fetchall()
    ]

    # Top topics by repo count for the topic filter.
    topic_sql = """
    SELECT t.name, COUNT(rtl.repo_id) AS cnt
    FROM topic t
    JOIN repo_topic_latest rtl ON rtl.topic_id = t.id
    GROUP BY t.id
    ORDER BY cnt DESC
    LIMIT 500
    """
    topics = [
        {"name": str(r["name"]), "count": int(r["cnt"])}
        for r in stats_db.conn.execute(topic_sql).fetchall()
    ]

    stats_db.close()

    log("Stopping PM2 service...")
    try:
        subprocess.run(["pm2", "stop", PM2_APP_NAME], check=False, stdout=subprocess.DEVNULL)
    except Exception as e:
        log(f"Warning: PM2 stop failed (might not be running): {e}")

    log("Swapping Database...")
    if os.path.exists(crawled_db_path):
        # Remove the stale live copy first so copy2 replaces it cleanly.
        if os.path.exists(LIVE_DB_PATH):
            os.remove(LIVE_DB_PATH)
        shutil.copy2(crawled_db_path, LIVE_DB_PATH)

    index_path = "index.html"
    if os.path.exists(index_path):
        with open(index_path, "r", encoding="utf-8") as fh:
            page = fh.read()

        # Replace the inner text of the element carrying id="totalRepos".
        page = re.sub(
            r'(id="totalRepos"[^>]*>).*?(</\w+>)',
            f"\\g<1>{formatted_total}\\g<2>",
            page,
        )

        with open(index_path, "w", encoding="utf-8") as fh:
            fh.write(page)
        log(f"Updated {index_path} with {formatted_total} repos.")

    app_js_path = "static/app.js"
    if os.path.exists(app_js_path):
        with open(app_js_path, "r", encoding="utf-8") as fh:
            bundle = fh.read()

        static_data = {"languages": languages, "topics": topics}
        injection_code1 = f"const STATIC_DATA = {json.dumps(static_data)};"

        # Only rewrite when the injection marker is present in the bundle.
        if "const STATIC_DATA =" in bundle:
            bundle = re.sub(r"const STATIC_DATA = \{.*?\};", injection_code1, bundle, flags=re.DOTALL)

        with open(app_js_path, "w", encoding="utf-8") as fh:
            fh.write(bundle)
        log(f"Updated {app_js_path} with static lists.")

    log("Restarting PM2 service...")
    try:
        subprocess.run(["pm2", "restart", PM2_APP_NAME], check=False, stdout=subprocess.DEVNULL)
    except Exception as e:
        log(f"Warning: PM2 restart failed (maybe not running): {e}")
    log("Deployment complete.")


def crawl() -> None:
    """Crawl every GitHub repo with >= MIN_STARS stars, then deploy the site.

    Works around GitHub search's 1000-result-per-query cap by paging in
    ascending-star order and re-issuing the query with a raised minimum
    star threshold after each ~1000-repo batch.
    """
    gh = GithubGraphQL(TOKEN)
    db = RepoDB(DB_PATH)
    current_min_stars = MIN_STARS
    total_fetched = 0

    log(f"Starting crawl for repos with >= {MIN_STARS} stars...")

    while True:
        # Ascending star sort lets the last repo of a batch define the next
        # query's minimum threshold.
        search_query = f"stars:>={current_min_stars} sort:stars-asc"
        log(f"Querying batch: '{search_query}'")

        cursor = None
        batch_repos = []
        has_next_page = True

        # Max 1000 results allowed by GitHub
        while has_next_page:
            # Small delay to stay friendly to the API rate limiter.
            time.sleep(0.1)
            data = gh.execute_query(search_query, cursor)
            search_data = data["search"]

            nodes = search_data["nodes"]
            if not nodes:
                break

            batch_repos.extend(nodes)
            total_fetched += len(nodes)

            log(f"  Fetched {len(nodes)} items. Total: {total_fetched}. Last star count: {nodes[-1]['stargazerCount']}")

            page_info = search_data["pageInfo"]
            has_next_page = page_info["hasNextPage"]
            cursor = page_info["endCursor"]

            if len(batch_repos) >= 1000:
                break

        if not batch_repos:
            log("No more results found.")
            break

        last_repo_stars = batch_repos[-1]["stargazerCount"]

        # If the whole batch sat on one star count, bump by one to make
        # progress; otherwise jump straight to the last repo's star count.
        # NOTE(review): the jump case can skip remaining repos that share
        # last_repo_stars beyond the 1000-result window -- confirm intended.
        if last_repo_stars == current_min_stars:
            current_min_stars += 1
        else:
            current_min_stars = last_repo_stars
        db.upsert_from_github_nodes(batch_repos)
    db.close()
    deploy_site(DB_PATH)


def run_at_hours(func: Callable, hours_list: List[int]):
    """Run *func* once per scheduled UTC hour, polling forever.

    Logs an ETA for the first run, then loops: whenever the current UTC
    hour is in *hours_list* and has not already fired, call *func*.
    Polls every 30 seconds.
    """
    start = datetime.now(tz=timezone.utc)
    hour_now = start.hour

    minutes_remaining = 0
    if hour_now not in hours_list:
        ordered = sorted(hours_list)
        upcoming = next((h for h in ordered if h > hour_now), ordered[0])

        target = start.replace(hour=upcoming, minute=0, second=0, microsecond=0)
        if upcoming <= hour_now:
            # Wrapped past midnight: the next slot is tomorrow.
            target += timedelta(days=1)

        minutes_remaining = int((target - start).total_seconds() / 60)

    log(f"Scheduler started for hours: {hours_list}")
    log(f"Next crawl will start in approximately {minutes_remaining} minutes.")

    last_run_hour = -1
    while True:
        hour = datetime.now(tz=timezone.utc).hour

        if hour in hours_list:
            # Fire at most once per scheduled hour.
            if hour != last_run_hour:
                func()
                last_run_hour = hour
        else:
            # Left the scheduled hour: re-arm for the next one.
            last_run_hour = -1

        time.sleep(30)
class Crawler:
    """Crawls GitHub for highly-starred repos and deploys the results site.

    Owns the GraphQL client, the crawl database, and the PM2-managed
    deployment of the static front end.
    """

    def __init__(
        self,
        token: str,
        min_stars: int = 1_000,
        db_path: str = "github_repos.db",
        live_db_path: str = "repos.db",
        pm2_app_name: str = "git_leaderboard",
    ):
        self.token = token
        self.min_stars = min_stars
        self.db_path = db_path
        self.live_db_path = live_db_path
        self.pm2_app_name = pm2_app_name

        self.gh = GithubGraphQL(self.token)
        # Long-lived crawl connection; deploy_site() opens its own
        # short-lived connection so deployment never touches this one.
        self.db = RepoDB(self.db_path)

        self.current_min_stars = self.min_stars
        self.total_fetched = 0
        # used by upsert logic to avoid duplicate processing in a run
        self._processed_repo_ids: set[int] = set()

    def log(self, *args, **kwargs):
        """Simple timestamped logger: [day:hour:minute:second]."""
        # Was "[%d:%H:%S]", which printed day:hour:SECOND and skipped the
        # minutes entirely -- almost certainly a typo; %M restored.
        timestamp = datetime.now().strftime("[%d:%H:%M:%S]")
        print(timestamp, *args, **kwargs)

    def deploy_site(self):
        """Publish the crawled database to the live site.

        Reads summary stats, stops PM2, swaps the live DB file, patches
        the repo count into index.html and the language/topic lists into
        static/app.js, then restarts PM2.
        """
        self.log("Preparing deployment...")

        # reopen a fresh connection so we don't interfere with crawling
        db = RepoDB(self.db_path)

        # Headline number: total repos currently tracked.
        row = db.conn.execute("SELECT COUNT(*) AS cnt FROM repo_latest").fetchone()
        total_repos = int(row["cnt"]) if row else 0
        formatted_total = "{:,}".format(total_repos)

        # Alphabetical language list for the filter dropdown.
        lang_rows = db.conn.execute("SELECT name FROM language ORDER BY name LIMIT 5000").fetchall()
        languages = [str(r["name"]) for r in lang_rows]

        # Top topics by repo count for the topic filter.
        topic_sql = """
        SELECT t.name, COUNT(rtl.repo_id) AS cnt
        FROM topic t
        JOIN repo_topic_latest rtl ON rtl.topic_id = t.id
        GROUP BY t.id
        ORDER BY cnt DESC
        LIMIT 500
        """
        topic_rows = db.conn.execute(topic_sql).fetchall()
        topics = [{"name": str(r["name"]), "count": int(r["cnt"])} for r in topic_rows]

        db.close()

        self.log("Stopping PM2 service...")
        try:
            subprocess.run(["pm2", "stop", self.pm2_app_name], check=False, stdout=subprocess.DEVNULL)
        except Exception as e:
            self.log(f"Warning: PM2 stop failed (might not be running): {e}")

        self.log("Swapping Database...")
        if os.path.exists(self.db_path):
            # Remove the stale live copy first so copy2 replaces it cleanly.
            if os.path.exists(self.live_db_path):
                os.remove(self.live_db_path)
            shutil.copy2(self.db_path, self.live_db_path)

        index_path = "index.html"
        if os.path.exists(index_path):
            with open(index_path, "r", encoding="utf-8") as f:
                html_content = f.read()

            # Replace the inner text of the element with id="totalRepos".
            html_content = re.sub(r'(id="totalRepos"[^>]*>).*?(</\w+>)', f"\\g<1>{formatted_total}\\g<2>", html_content)

            with open(index_path, "w", encoding="utf-8") as f:
                f.write(html_content)
            self.log(f"Updated {index_path} with {formatted_total} repos.")

        app_js_path = "static/app.js"
        if os.path.exists(app_js_path):
            with open(app_js_path, "r", encoding="utf-8") as f:
                js_content = f.read()

            static_data = {"languages": languages, "topics": topics}
            injection_code1 = f"const STATIC_DATA = {json.dumps(static_data)};"

            # Only rewrite when the injection marker exists in the bundle.
            if "const STATIC_DATA =" in js_content:
                js_content = re.sub(r"const STATIC_DATA = \{.*?\};", injection_code1, js_content, flags=re.DOTALL)

            with open(app_js_path, "w", encoding="utf-8") as f:
                f.write(js_content)
            self.log(f"Updated {app_js_path} with static lists.")

        self.log("Restarting PM2 service...")
        try:
            subprocess.run(["pm2", "restart", self.pm2_app_name], check=False, stdout=subprocess.DEVNULL)
        except Exception as e:
            self.log(f"Warning: PM2 restart failed (maybe not running): {e}")
        self.log("Deployment complete.")

    @staticmethod
    def _next_run_time(now: datetime, hours: List[int]) -> datetime:
        """Return the next scheduled run time strictly after *now*.

        Fix: the inline version assumed *hours* was sorted ascending (it
        used hours[0] + 24 as the wraparound), giving a wrong next slot
        for unsorted input. Sorting first makes it order-independent.
        """
        ordered = sorted(hours)
        next_hour = min((h for h in ordered if h > now.hour), default=ordered[0] + 24)
        target = now.replace(hour=next_hour % 24, minute=0, second=0, microsecond=0)
        if target <= now:
            target += timedelta(days=1)
        return target

    def run_at_hours(self, hours: List[int]):
        """Block forever, running crawl + deploy at each UTC hour in *hours*."""
        self.log(f"Starting crawler. Will run at hours: {hours}")
        while True:
            # Fix: use aware UTC time -- the pre-refactor scheduler ran on
            # UTC, and naive datetime.now() silently shifted the schedule
            # to server-local time.
            now = datetime.now(tz=timezone.utc)
            if now.hour in hours:
                self.log("Starting crawl cycle...")
                try:
                    # NOTE(review): crawl_and_update is not defined in the
                    # visible portion of this file -- confirm it exists on
                    # the class elsewhere.
                    self.crawl_and_update()
                    self.deploy_site()
                except Exception as e:
                    self.log(f"Error during crawl/deploy: {e}")
                self.log("Cycle complete. Sleeping for 1 hour.")
                time.sleep(3600)
            else:
                next_run_time = self._next_run_time(now, hours)
                sleep_seconds = (next_run_time - now).total_seconds()
                self.log(f"Current hour {now.hour} not in target hours. Sleeping for {sleep_seconds/3600:.2f} hours until {next_run_time}.")
                time.sleep(sleep_seconds)


if __name__ == "__main__":
    # Entry point: crawl and deploy at 00:00, 06:00, 12:00 and 18:00.
    # Fix: the stale pre-refactor call `run_at_hours(crawl, [0, 6, 12, 18])`
    # was left above this line; with the old free functions removed it would
    # raise NameError before the Crawler ever ran.
    crawler = Crawler(TOKEN)
    crawler.run_at_hours([0, 6, 12, 18])
Loading