diff --git a/README.md b/README.md
index e4c6e18..db4de7b 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,11 @@ projects using ScratchGPT.
 Please take a look at the [simple example](./examples/simple.py) in the
 examples folder.
 
+**Note:** Some examples require additional dependencies. To run all examples, install the optional dependencies:
+```bash
+uv sync --extra examples-dependencies
+```
+
 ## Usage
 
 ### Training
diff --git a/examples/chess.py b/examples/chess.py
new file mode 100644
index 0000000..910a09b
--- /dev/null
+++ b/examples/chess.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+"""
+Chess Engine Training Example - Train a transformer to predict chess moves using ScratchGPT
+
+This script demonstrates training a GPT-style model on chess games from the Lichess database.
+It downloads a collection of games in PGN format, parses them into move sequences,
+and trains a transformer to continue games by predicting the next moves.
+
+The model learns chess patterns without knowing the rules - it just sees that certain
+move sequences tend to follow others in rated games from Lichess.
+
+Usage:
+    python chess.py
+    python chess.py -g https://database.lichess.org/blitz/lichess_db_blitz_rated_2024-01.pgn.zst
+"""
+
+import argparse
+import re
+import sys
+import tempfile
+import time
+from pathlib import Path
+from urllib.parse import urlparse
+from urllib.request import urlretrieve
+
+import torch
+import zstandard as zstd
+from torch.optim import AdamW
+
+from examples.chess_tokenizer import ChessTokenizer
+from scratchgpt import (
+    ScratchGPTArchitecture,
+    ScratchGPTConfig,
+    ScratchGPTTraining,
+    Trainer,
+    TransformerLanguageModel,
+    save_tokenizer,
+)
+from scratchgpt.data import create_data_source
+
+# Alternative: use character-level tokenization
+# from scratchgpt import CharTokenizer
+
+# Default Lichess database file
+DEFAULT_LICHESS_URL: str = "https://database.lichess.org/standard/lichess_db_standard_rated_2016-02.pgn.zst"
+GAME_PREVIEW_MAX_LENGTH: int = 80
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description="Train a chess move predictor using ScratchGPT")
+    parser.add_argument(
+        "-g",
+        "--game-url",
+        type=str,
+        default=DEFAULT_LICHESS_URL,
+        help=f"Lichess database URL to download (default: {DEFAULT_LICHESS_URL})",
+    )
+    return parser.parse_args()
+
+
+class ChessDataLoader:
+    """Handles downloading and parsing of Lichess chess databases."""
+
+    def __init__(self, game_url: str) -> None:
+        self.game_url = game_url
+
+    def download_and_parse(self) -> str:
+        """Download, decompress, and parse chess games into clean move sequences."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            temp_path = Path(tmp_dir)
+            print(f"Working in temporary directory: {temp_path}")
+            pgn_file = self._download_and_decompress(temp_path)
+            games_text = self._parse_pgn_to_games(pgn_file)
+            return games_text
+
+    def _download_and_decompress(self, temp_dir: Path) -> Path:
+        """Download and decompress the Lichess database file."""
+        filename = Path(urlparse(self.game_url).path).name
+        compressed_file = temp_dir / filename
+
+        print(f"Downloading: {filename}")
+        print("This may take several minutes depending on file size...")
+        urlretrieve(self.game_url, compressed_file)
+
+        pgn_file = temp_dir / filename.replace(".zst", "")
+        print(f"Decompressing: {filename}")
+
+        dctx = zstd.ZstdDecompressor()
+        with open(compressed_file, "rb") as compressed_fp, open(pgn_file, "wb") as output_fp:
+            dctx.copy_stream(compressed_fp, output_fp)
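+            # copy_stream decompresses in fixed-size chunks, so even multi-gigabyte
+            # .zst archives are handled without reading the whole file into memory.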
+
+        # Remove compressed file to save space
+        compressed_file.unlink()
+        return pgn_file
+
+    def _parse_pgn_to_games(self, pgn_file: Path) -> str:
+        """Parse PGN file and extract move sequences."""
+        print(f"Parsing games from: {pgn_file.name}")
+
+        games = []
+        current_game_lines = []
+        games_processed = 0
+
+        with open(pgn_file, encoding="utf-8", errors="ignore") as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+
+                if line_num % 1_000_000 == 0:
+                    print(f"Processed {line_num:,} lines, found {games_processed:,} games")
+                if line.startswith("["):
+                    continue
+                if not line:
+                    continue
+
+                current_game_lines.append(line)
+
+                if any(result in line for result in ["1-0", "0-1", "1/2-1/2", "*"]):
+                    game_text = " ".join(current_game_lines).strip()
+                    clean_text = self._clean_game_text(game_text)
+                    # Only keep games whose cleaned text has more than two
+                    # whitespace-separated tokens, i.e. at least one full move
+                    has_many_moves = len(clean_text.split()) > 2
+                    if has_many_moves:
+                        games.append(clean_text)
+                        games_processed += 1
+
+                    # Reset for next game
+                    current_game_lines = []
+
+        print(f"Extracted {len(games)} valid games")
+        return "\n".join(games)
+
+    def _clean_game_text(self, game_text: str) -> str:
+        """Clean annotations and comments from game text."""
+        # Remove comments in curly braces
+        game_text = re.sub(r"\{[^}]*\}", " ", game_text)
+
+        # Remove evaluation annotations like [%eval 0.5]
+        game_text = re.sub(r"\[%[^\]]*\]", " ", game_text)
+
+        # Clean up multiple spaces
+        game_text = re.sub(r"\s+", " ", game_text).strip()
+
+        # Remove game results from the end
+        for result in ["1-0", "0-1", "1/2-1/2", "*"]:
+            suffix = f" {result}"
+            if game_text.endswith(suffix):
+                game_text = game_text.removesuffix(suffix)
+                break
+        return game_text
+
+
+def create_chess_config(tokenizer_vocab_size: int) -> ScratchGPTConfig:
+    """Create a configuration optimized for chess move prediction."""
+    # Chess-optimized architecture
+    architecture = ScratchGPTArchitecture(
+        block_size=256,  # Longer context for chess games (can see ~60-80 moves)
+        embedding_size=384,  # Balanced size for chess vocabulary
+        num_heads=8,  # Good attention for chess patterns
+        num_blocks=6,  # Sufficient depth for chess understanding
+        vocab_size=tokenizer_vocab_size,
+    )
+
+    # Training config optimized for chess patterns
+    training = ScratchGPTTraining(
+        max_epochs=15,  # Chess move patterns converge in fewer epochs than natural language
+        learning_rate=3e-4,  # Standard rate works well for chess
+        batch_size=32,  # Good balance for chess sequences
+        dropout_rate=0.1,  # Lower dropout for structured chess patterns
+        random_seed=1337,
+        iteration_type="chunking",
+    )
+
+    return ScratchGPTConfig(architecture=architecture, training=training)
+
+
+def generate_chess_moves(
+    device: torch.device,
+    model: TransformerLanguageModel,
+    tokenizer,
+    game_start: str,
+    max_moves: int = 8,
+    temperature: float = 0.8,
+) -> str:
+    """
+    Generate a continuation of a game, a few tokens at a time.
+
+    Uses moderate temperature to balance chess-like patterns with some creativity.
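+
+    With the ChessTokenizer, each move number and each SAN move is a single
+    token, so each iteration below typically extends the game by two or three
+    half-moves rather than exactly one move.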
+    """
+    model.eval()
+
+    current_game = game_start
+
+    with torch.no_grad():
+        for _ in range(max_moves):
+            # Encode current game state
+            context = torch.tensor(tokenizer.encode(current_game)).unsqueeze(0).to(device)
+
+            # Generate a handful of tokens (one token per move number or SAN move)
+            context = model.generate(context=context, max_new_tokens=6, temperature=temperature)
+            current_game = tokenizer.decode(context[0].tolist())
+
+    return current_game
+
+
+def main() -> None:
+    print("Chess Move Prediction Training with ScratchGPT")
+    print("=" * 60)
+
+    # Parse arguments
+    args = parse_args()
+
+    # Step 1: Download and parse chess data
+    print("\n--- Downloading and Parsing Chess Games ---")
+    data_loader = ChessDataLoader(args.game_url)
+    games_text = data_loader.download_and_parse()
+
+    if not games_text.strip():
+        print("ERROR: No games were parsed successfully!")
+        sys.exit(1)
+
+    # Show sample of parsed games
+    sample_games = games_text.split("\n")[:3]
+    print("\nSample parsed games:")
+    for i, game in enumerate(sample_games, 1):
+        preview = game[:GAME_PREVIEW_MAX_LENGTH] + "..." if len(game) > GAME_PREVIEW_MAX_LENGTH else game
+        print(f"{i}: {preview}")
+
+    # Step 2: Setup tokenizer
+    print("\n--- Creating Chess Tokenizer ---")
+    tokenizer = ChessTokenizer()
+    print(f"Chess vocabulary size: {tokenizer.vocab_size:,}")
+
+    # Alternative approach using character-level tokenization:
+    # tokenizer = CharTokenizer(text=games_text)
+    # print(f"Character vocabulary size: {tokenizer.vocab_size}")
+    #
+    # Trade-offs:
+    # - ChessTokenizer: Domain-specific, understands chess moves as units (~12k vocab)
+    # - CharTokenizer: General, treats chess as character sequences (~60 vocab)
+    # - ChessTokenizer should learn chess patterns more efficiently
+
+    # Step 3: Create chess-optimized configuration
+    print("\n--- Creating Chess Model Configuration ---")
+    config = create_chess_config(tokenizer.vocab_size)
+    print(
+        f"Model configuration: {config.architecture.embedding_size}D embeddings, "
+        f"{config.architecture.num_blocks} blocks, {config.architecture.num_heads} heads"
+    )
+
+    # Step 4: Setup device and model
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"\nUsing device: {device}")
+
+    if device.type == "cpu":
+        print("⚠️  WARNING: Training on CPU will be slow!")
+        print("   Expected time: 1-2 hours per epoch")
+        response = input("Continue? (y/N): ")
+        if response.lower() != "y":
+            sys.exit(1)
+
+    model = TransformerLanguageModel(config)
+    model = model.to(device)
+    total_params = sum(p.numel() for p in model.parameters())
+    print(f"Model parameters: {total_params:,}")
+
+    # Step 5: Setup training
+    optimizer = AdamW(model.parameters(), lr=config.training.learning_rate, betas=(0.9, 0.95), weight_decay=0.01)
+
+    # Create temporary file for chess games and data source
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        temp_path = Path(tmp_dir)
+        chess_games_file = temp_path / "chess_games.txt"
+
+        # Save parsed games to file
+        with open(chess_games_file, "w", encoding="utf-8") as f:
+            f.write(games_text)
+
+        # Create data source using ScratchGPT's standard approach
+        data_source = create_data_source(str(chess_games_file))
+
+        # Create experiment directory
+        experiment_dir = temp_path / "chess_experiment"
+
+        # Create trainer
+        trainer = Trainer(
+            model=model, config=config.training, optimizer=optimizer, experiment_path=experiment_dir, device=device
+        )
+
+        # Save tokenizer
+        save_tokenizer(experiment_dir, tokenizer)
+
+        # Step 6: Training
+        print("\n--- Starting Chess Training ---")
+        print("The model will learn to predict chess moves from rated games in the Lichess database")
+        print("Press Ctrl-C to stop training early and proceed to move generation demo")
+
+        start_time = time.time()
+
+        try:
+            trainer.train(data_source=data_source, tokenizer=tokenizer)
+            print(f"\n✅ Training completed in {time.time() - start_time:.1f} seconds")
+        except KeyboardInterrupt:
+            print(f"\n⚠️  Training interrupted after {time.time() - start_time:.1f} seconds")
+            print("Proceeding with chess move generation demo...")
+
+        # Step 7: Chess Move Generation Demo
+        print("\n--- Chess Move Generation Demo ---")
+        model.eval()
+
+        # Test with famous chess openings
+        test_positions = [
+            "1. e4 e5 2. Nf3",  # Italian Game start
+            "1. d4 d5 2. c4",  # Queen's Gambit
+            "1. e4 c5",  # Sicilian Defense
+            "1. Nf3 Nf6 2. c4",  # English Opening
+            "1. e4 e6 2. d4",  # French Defense
+        ]
+
+        print("Generating continuations for famous chess openings:")
+        print("=" * 70)
+
+        for position in test_positions:
+            print(f"\nPosition: {position}")
+            print("-" * 50)
+
+            # Generate continuation
+            continuation = generate_chess_moves(
+                device=device, model=model, tokenizer=tokenizer, game_start=position + " ", max_moves=6, temperature=0.8
+            )
+
+            # Extract generated part
+            generated_part = continuation[len(position) :].strip()
+
+            # Show the first several moves of the continuation
+            generated_moves = generated_part.split()[:12]  # ~12 tokens is roughly 4 full moves
+            if generated_moves:
+                print(f"Continuation: {' '.join(generated_moves)}")
+            else:
+                print("Generated: (no valid continuation)")
+
+        print("\n" + "=" * 70)
+        print("Chess move prediction training complete!")
+        print("\nWhat the model learned:")
+        print("- Chess move patterns from thousands of rated Lichess games")
+        print("- Common responses to popular openings")
+        print("- Typical piece development and tactical motifs")
+        print("- The model doesn't know chess rules, just statistical patterns!")
+
+        print(f"\nExperiment saved temporarily to: {experiment_dir}")
+        print("All files will be cleaned up when the script exits.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/chess_tokenizer.py b/examples/chess_tokenizer.py
new file mode 100644
index 0000000..c521468
--- /dev/null
+++ b/examples/chess_tokenizer.py
@@ -0,0 +1,163 @@
+import json
+from pathlib import Path
+from typing import Self, override
+
+from scratchgpt.tokenizer.base_tokenizer import SerializableTokenizer, register_tokenizer
+
+
+@register_tokenizer("ChessTokenizer")
+class ChessTokenizer(SerializableTokenizer):
+    """
+    A deterministic tokenizer for chess games in Standard Algebraic Notation (SAN).
+
+    This tokenizer uses a pre-generated, fixed vocabulary that covers all
+    syntactically valid SAN moves, move numbers, and special PGN symbols.
+    It does not learn from data but is designed to be comprehensive for the
+    domain of chess.
+    """
+
+    def __init__(self, vocab: list[str] | None = None) -> None:
+        if vocab is not None:
+            self._vocabulary = vocab
+        else:
+            self._vocabulary = self._create_vocabulary()
+
+        self._encoding_mapping = {token: i for i, token in enumerate(self._vocabulary)}
+        self._decoding_mapping = dict(enumerate(self._vocabulary))
+
+    @staticmethod
+    def _create_vocabulary() -> list[str]:
+        """Generates the complete, deterministic vocabulary for chess."""
+        # Control and special tokens
+        tokens = {"[PAD]", "[UNK]", "[BOS]", "[EOS]", "*", "+", "#"}
+        tokens.add("O-O")  # Kingside castling
+        tokens.add("O-O-O")  # Queenside castling
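+
+        # Rough size arithmetic for this vocabulary: 64 pawn pushes, 8*64 pawn
+        # captures, (16*4 + 8*16*4) promotion moves, 5*64*(2 + 16 + 16) piece
+        # moves with disambiguation, 300 move numbers, and the specials above,
+        # for roughly 12,000 distinct tokens.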
+
+        # Move numbers (1. to 300.)
+        for i in range(1, 301):
+            tokens.add(f"{i}.")
+            # not adding tokens for Black-to-move number fragments like "1..."
+            # tokens.add(f"{i}...")
+
+        pieces = ["N", "B", "R", "Q", "K"]
+        files = ["a", "b", "c", "d", "e", "f", "g", "h"]
+        ranks = ["1", "2", "3", "4", "5", "6", "7", "8"]
+        squares = [f + r for f in files for r in ranks]
+        promotions = ["=Q", "=R", "=B", "=N"]
+
+        # Pawn moves and captures
+        for square in squares:
+            tokens.add(square)  # e.g., e4
+            for file in files:
+                tokens.add(file + "x" + square)  # e.g., dxe5
+
+        # Pawn promotions and capture-promotions
+        promo_ranks = {"1", "8"}
+        for square in squares:
+            if square[1] in promo_ranks:
+                for p_piece in promotions:
+                    tokens.add(square + p_piece)  # e.g., e8=Q
+                    for file in files:
+                        tokens.add(file + "x" + square + p_piece)  # e.g., dxe8=Q
+
+        # Piece moves (including disambiguation)
+        for piece in pieces:
+            for square in squares:
+                tokens.add(piece + square)
+                tokens.add(piece + "x" + square)
+                for file in files:
+                    tokens.add(piece + file + square)
+                    tokens.add(piece + file + "x" + square)
+                for rank in ranks:
+                    tokens.add(piece + rank + square)
+                    tokens.add(piece + rank + "x" + square)
+
+        return sorted(tokens)
+
+    @override
+    def encode(self, text: str) -> list[int]:
+        # Add spaces around check/mate symbols to ensure they are tokenized separately
+        processed_text = text.replace("+", " + ").replace("#", " # ")
+
+        raw_tokens = [token for token in processed_text.split() if token]
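+
+        # Anything outside the fixed vocabulary (for example a result marker like
+        # "1-0" that survived preprocessing) falls back to the [UNK] id below.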
+        unk_token_id = self._encoding_mapping["[UNK]"]
+        return [self._encoding_mapping.get(token, unk_token_id) for token in raw_tokens]
+
+    @override
+    def decode(self, encoding: list[int]) -> str:
+        return " ".join(self._decoding_mapping.get(idx, "?") for idx in encoding)
+
+    @property
+    @override
+    def vocab_size(self) -> int:
+        return len(self._vocabulary)
+
+    @property
+    @override
+    def vocabulary(self) -> list[str]:
+        return self._vocabulary
+
+    @override
+    def save(self, tokenizer_path: Path) -> None:
+        """Saves the vocabulary and config file."""
+        super().save(tokenizer_path)
+        vocab_file = tokenizer_path / "vocab.json"
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            json.dump(self.vocabulary, f, indent=2)
+
+        config = {
+            "tokenizer_type": "ChessTokenizer",
+            "vocab_file": "vocab.json",
+        }
+        config_path = tokenizer_path / "tokenizer_config.json"
+        with open(config_path, "w", encoding="utf-8") as f:
+            json.dump(config, f, indent=2)
+
+    @classmethod
+    @override
+    def load(cls, tokenizer_path: Path) -> Self:
+        """Loads a ChessTokenizer from a directory."""
+        config_path = tokenizer_path / "tokenizer_config.json"
+        if not config_path.is_file():
+            raise FileNotFoundError(f"Tokenizer config not found at {config_path}")
+
+        with open(config_path, encoding="utf-8") as f:
+            config = json.load(f)
+
+        vocab_filename = config.get("vocab_file")
+        if not vocab_filename:
+            raise ValueError("Tokenizer config is missing 'vocab_file' key.")
+
+        vocab_file = tokenizer_path / vocab_filename
+        if not vocab_file.is_file():
+            raise FileNotFoundError(f"Vocabulary file not found at {vocab_file}")
+
+        with open(vocab_file, encoding="utf-8") as f:
+            vocab = json.load(f)
+
+        return cls(vocab=vocab)
+
+
+def main() -> None:
+    tokenizer = ChessTokenizer()
+    game = """
+1. e4 d5 2. Nf3 dxe4 3. Ng5 Bf5 4. Nc3 Qd4 5. Qe2 Nf6 6. Qb5+ Nbd7 7. Qxb7 Rb8 8. Qxc7 h6 9. Nh3 Bxh3 10. gxh3 e6
+11. Qg3 Ne5 12. Bb5+ Ke7 13. Be2 g5 14. O-O h5 15. d3 g4 16. Qh4 gxh3 17. Bg5 Kd6 18. Bxf6 Rg8+ 19. Kh1 Ng4
+20. Bxd4 Rxb2 21. Qg3+ e5 22. Nxe4+ Kd5 23. Nf6+ Nxf6 24. Qxe5+ Kc6 25. Qxf6+ Bd6 26. Bxb2 Rg2 27. Qf3+ Kc5 28. d4+ Kb4
+29. c3+ Ka4 30. Bd1+ Kb5 31. a4+ Kc4 32. Qd3+ Kd5 33. Bf3+ Ke6 34. Bxg2 hxg2+ 35. Kxg2 h4 36. Qe4+ Kd7 37. f4 f5
+38. Qxf5+ Kc6 39. d5+ Kc5 40. Qd3 Kb6 41. Qd4+ Bc5 42. Qc4 a5 43. Ba3 Bxa3 44. Qb5+ Kc7 45. Rxa3 Kd6 46. c4 Ke7
+47. Qc6 Kf7 48. Re3
+""".strip().replace("\n", " ")
+
+    print(f"{tokenizer.vocab_size=}")
+    print(f"{game=}")
+    tokens = tokenizer.encode(game)
+    print(f"{tokens[:10]=}")
+
+    decoded_game = tokenizer.decode(tokens)
+    print(f"{decoded_game=}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 087665d..90b4605 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,9 @@ hf-tokenizers = [
     "tokenizers>=0.19.0",
     "huggingface-hub>=0.34.4",
 ]
+examples-dependencies = [
+    "zstandard"
+]
 
 [dependency-groups]
diff --git a/scratchgpt/training/tokenize_utils.py b/scratchgpt/training/tokenize_utils.py
index a92dbf1..63d2bbf 100644
--- a/scratchgpt/training/tokenize_utils.py
+++ b/scratchgpt/training/tokenize_utils.py
@@ -87,12 +87,12 @@ def prepare_dataset_for_training(
 
 class SlidingWindowDataset(TorchDataset[dict[str, Tensor]]):
     def __init__(
-            self,
-            hf_dataset: HFDataset,
-            tokenizer: Tokenizer,
-            block_size: int,
-            text_column: str,
-        ) -> None:
+        self,
+        hf_dataset: HFDataset,
+        tokenizer: Tokenizer,
+        block_size: int,
+        text_column: str,
+    ) -> None:
         super().__init__()
 
         self.block_size = block_size
diff --git a/tests/data/test_datasource.py b/tests/data/test_datasource.py
index 9be7f62..9193b30 100644
--- a/tests/data/test_datasource.py
+++ b/tests/data/test_datasource.py
@@ -114,12 +114,7 @@ def multiline_text_file(tmp_path: Path) -> Path:
     """Creates a text file with multiple lines for proper splitting."""
     data_path = tmp_path / "multiline.txt"
     # Each line becomes a separate sample in the dataset
-    data_path.write_text(
-        "0123456789abcdef\n"
-        "0123456789abcdef\n"
-        "0123456789abcdef\n"
-        "0123456789abcdef"
-    )
+    data_path.write_text("0123456789abcdef\n0123456789abcdef\n0123456789abcdef\n0123456789abcdef")
     return data_path
diff --git a/tests/examples/test_chess_tokenizer.py b/tests/examples/test_chess_tokenizer.py
new file mode 100644
index 0000000..f4b04fb
--- /dev/null
+++ b/tests/examples/test_chess_tokenizer.py
@@ -0,0 +1,214 @@
+import json
+from pathlib import Path
+
+import pytest
+
+from examples.chess_tokenizer import ChessTokenizer
+from scratchgpt.model_io import save_tokenizer
+
+
+def test_save_and_load_happy_path(tmp_path: Path):
+    """Tests standard saving and loading of a ChessTokenizer."""
+    original_tokenizer = ChessTokenizer()
+    tokenizer_dir = tmp_path / "experiment"
+
+    save_tokenizer(tokenizer_dir, original_tokenizer)
+
+    # Use the class's own .load() method for a direct unit test
+    loaded_tokenizer = ChessTokenizer.load(tokenizer_dir / "tokenizer")
+
+    assert isinstance(loaded_tokenizer, ChessTokenizer)
+    assert loaded_tokenizer.vocabulary == original_tokenizer.vocabulary
+    assert loaded_tokenizer.vocab_size == original_tokenizer.vocab_size
+
+
+def test_chess_move_encoding_and_decoding():
+    """Tests encoding and decoding of various chess moves."""
+    tokenizer = ChessTokenizer()
+
+    # Test basic moves
+    basic_moves = "1. e4 e5 2. Nf3 Nc6"
+    encoded = tokenizer.encode(basic_moves)
+    decoded = tokenizer.decode(encoded)
+    assert decoded == basic_moves
+
+    # Test captures
+    captures = "1. e4 d5 2. exd5 Qxd5"
+    encoded = tokenizer.encode(captures)
+    decoded = tokenizer.decode(encoded)
+    assert decoded == captures
+
+    # Test check and checkmate
+    check_mate = "1. e4 e5 2. Qh5 Nc6 3. Bc4 Nf6 4. Qxf7+ Ke7 5. Qf3#"
+    # The tokenizer should handle + and # by adding spaces around them
+    encoded = tokenizer.encode(check_mate)
+    decoded = tokenizer.decode(encoded)
+    expected = "1. e4 e5 2. Qh5 Nc6 3. Bc4 Nf6 4. Qxf7 + Ke7 5. Qf3 #"
+    assert decoded == expected
+
+
+def test_castling_moves():
+    """Tests that castling moves are properly tokenized."""
+    tokenizer = ChessTokenizer()
+
+    # These should be in the vocabulary for the tokenizer to work properly
+    # If they're not, the test will fail and indicate what needs to be fixed
+    castling_game = "1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6 5. O-O Be7 6. Re1 b5 7. Bb3 O-O"
+
+    encoded = tokenizer.encode(castling_game)
+    decoded = tokenizer.decode(encoded)
+
+    # Check that no [UNK] tokens were generated
+    assert "[UNK]" not in decoded
+
+    # The moves should decode properly (castling might have spaces added around them)
+    # This test will reveal if castling moves are missing from vocabulary
+    tokens = decoded.split()
+    assert "O-O" in tokens or "O - O" in tokens  # Either form should work
+
+
+def test_promotion_moves():
+    """Tests pawn promotion moves."""
+    tokenizer = ChessTokenizer()
+
+    promotion_moves = """
+1. e4 e5 2. d4 exd4 3. c3 dxc3 4. bxc3 d6 5. Nf3 Bg4 6. Be2 Nc6 7. O-O Qd7 8. Re1 O-O-O 9. Nd4 Bxe2 10. Qxe2 Nxd4
+11. cxd4 Kb8 12. Nc3 f5 13. exf5 Qxf5 14. Qe8+ Rxe8 15. Rxe8+ Kc7 16. d5 Qxf2+ 17. Kh1 Qf1+ 18. Bg1 Ne7 19. Rae1 Nxd5
+20. Nxd5+ Kd7 21. Re7+ Kd8 22. Rxg7 Qxe1 23. Rxg1 Qe2 24. Rg8+ Ke8 25. Nf6+ Kf7 26. Nh5+ Kf8 27. Rg3 Qe1
+28. Rf3+ Kg8 29. Nf6+ Kh8 30. Nxh7 Qe8+ 31. Rf8 Qxf8+ 32. Nxf8 a5 33. Nd7 a4 34. Nxc5 dxc5 35. a3 c4 36. Kg2 c3
+37. Kf3 c2 38. Ke2 c1=Q""".strip().replace("\n", " ")
+
+    encoded = tokenizer.encode(promotion_moves)
+    decoded = tokenizer.decode(encoded)
+
+    # Should not have unknown tokens
+    assert "[UNK]" not in decoded
+
+    # Should contain promotion notation
+    assert "c1=Q" in decoded or "c1 = Q" in decoded
+
+
+def test_unknown_token_handling():
+    """Tests handling of unknown tokens."""
+    tokenizer = ChessTokenizer()
+
+    # Test with some invalid chess notation
+    invalid_moves = "1. e4 xyz 2. Nf3 invalid_move"
+    encoded = tokenizer.encode(invalid_moves)
+    decoded = tokenizer.decode(encoded)
+
+    # Unknown tokens should be replaced with [UNK] in encoding
+    unk_id = tokenizer._encoding_mapping["[UNK]"]
+    assert unk_id in encoded
+
+    # Decoding should show the [UNK] token for unknown moves
+    assert "[UNK]" in decoded
+
+
+def test_special_tokens():
+    """Tests that special tokens are in vocabulary."""
+    tokenizer = ChessTokenizer()
+    vocab = tokenizer.vocabulary
+
+    # Check that special tokens exist
+    expected_special = ["[PAD]", "[UNK]", "[BOS]", "[EOS]", "*", "+", "#"]
+    for token in expected_special:
+        assert token in vocab, f"Special token {token} missing from vocabulary"
+
+
+def test_vocabulary_size():
+    """Tests vocabulary size is reasonable."""
+    tokenizer = ChessTokenizer()
+    vocab_size = tokenizer.vocab_size
+
+    # Chess vocabulary should be substantial but not excessive
+    # This is a sanity check - adjust bounds if needed
+    assert 1000 < vocab_size < 50000, f"Vocabulary size {vocab_size} seems unreasonable"
+
+    # All vocabulary items should be unique
+    assert len(tokenizer.vocabulary) == len(set(tokenizer.vocabulary))
+
+
+def test_save_and_load_preserves_functionality(tmp_path: Path):
+    """Tests that saved and loaded tokenizer works identically."""
+    original_tokenizer = ChessTokenizer()
+    tokenizer_dir = tmp_path / "experiment"
+
+    # Test game from the original example
+    test_game = "1. e4 d5 2. Nf3 dxe4 3. Ng5 Bf5 4. Nc3 Qd4"
+
+    # Encode with original
+    original_encoded = original_tokenizer.encode(test_game)
+    original_decoded = original_tokenizer.decode(original_encoded)
+
+    # Save and load
+    save_tokenizer(tokenizer_dir, original_tokenizer)
+    loaded_tokenizer = ChessTokenizer.load(tokenizer_dir / "tokenizer")
+
+    # Encode with loaded tokenizer
+    loaded_encoded = loaded_tokenizer.encode(test_game)
+    loaded_decoded = loaded_tokenizer.decode(loaded_encoded)
+
+    # Should be identical
+    assert original_encoded == loaded_encoded
+    assert original_decoded == loaded_decoded
+
+
+def test_load_error_missing_vocab_file(tmp_path: Path):
+    """Tests that ChessTokenizer.load() fails if vocab.json is missing."""
+    tokenizer_dir = tmp_path / "tokenizer"
+    tokenizer_dir.mkdir()
+
+    # Manually create only the config file, but not the vocab file
+    config = {"tokenizer_type": "ChessTokenizer", "vocab_file": "vocab.json"}
+    with open(tokenizer_dir / "tokenizer_config.json", "w") as f:
+        json.dump(config, f)
+
+    with pytest.raises(FileNotFoundError, match="Vocabulary file not found"):
+        ChessTokenizer.load(tokenizer_dir)
+
+
+def test_load_error_missing_config_file(tmp_path: Path):
+    """Tests that ChessTokenizer.load() fails if config file is missing."""
+    tokenizer_dir = tmp_path / "tokenizer"
+    tokenizer_dir.mkdir()
+
+    with pytest.raises(FileNotFoundError, match="Tokenizer config not found"):
+        ChessTokenizer.load(tokenizer_dir)
+
+
+def test_load_error_malformed_config(tmp_path: Path):
+    """Tests that ChessTokenizer.load() fails if config is malformed."""
+    tokenizer_dir = tmp_path / "tokenizer"
+    tokenizer_dir.mkdir()
+
+    # Create config without vocab_file key
+    config = {"tokenizer_type": "ChessTokenizer"}
+    with open(tokenizer_dir / "tokenizer_config.json", "w") as f:
+        json.dump(config, f)
+
+    with pytest.raises(ValueError, match="missing 'vocab_file' key"):
+        ChessTokenizer.load(tokenizer_dir)
+
+
+def test_real_chess_game_compatibility():
+    """Tests with a real chess game similar to what preprocessing would produce."""
+    tokenizer = ChessTokenizer()
+
+    # This is similar to what our chess.py preprocessing would produce
+    real_game = """
+1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6 5. O-O Be7 6. Re1 b5 7. Bb3 d6 8. c3 O-O 9. h3 Nb8 10. d4 Nbd7 11. c4 c6
+12. cxb5 axb5 13. Nc3 Bb7 14. Bg5 b4 15. Nb1 h6 16. Bh4 c5 17. dxe5 Nxe5 18. Nxe5 dxe5 19. Bxf6 Bxf6 20. Nd2 c4
+21. Bc2 Qc7 22. Ne4 Be7 23. Qd4 Rfd8 24. Qxe5 Qxe5 25. Nxe5
+""".strip().replace("\n", " ")
+
+    encoded = tokenizer.encode(real_game)
+    decoded = tokenizer.decode(encoded)
+
+    # Should encode without unknown tokens
+    unk_count = sum(1 for token_id in encoded if tokenizer._decoding_mapping.get(token_id) == "[UNK]")
+    assert unk_count == 0, f"Found {unk_count} unknown tokens in real game"
+
+    # Should be able to round-trip
+    assert len(encoded) > 0
+    assert len(decoded) > 0
diff --git a/uv.lock b/uv.lock
index a5aab7d..b6cb779 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1153,7 +1153,7 @@ wheels = [
 
 [[package]]
 name = "scratchgpt"
-version = "0.5.0"
+version = "0.5.1"
 source = { editable = "." }
 dependencies = [
     { name = "datasets" },
 ]
 
 [package.optional-dependencies]
+examples-dependencies = [
+    { name = "zstandard" },
+]
 hf-tokenizers = [
     { name = "huggingface-hub" },
     { name = "tokenizers" },
 ]
@@ -1194,8 +1197,9 @@ requires-dist = [
     { name = "torch", specifier = ">=2.8.0" },
     { name = "tqdm", specifier = ">=4.67.1" },
     { name = "types-tqdm", specifier = ">=4.67.0.20250809" },
+    { name = "zstandard", marker = "extra == 'examples-dependencies'" },
 ]
-provides-extras = ["hf-tokenizers"]
+provides-extras = ["hf-tokenizers", "examples-dependencies"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -1504,3 +1508,60 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" },
     { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" },
 ]
+
+[[package]]
+name = "zstandard"
+version = "0.25.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738, upload-time = "2025-09-14T22:16:56.237Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436, upload-time = "2025-09-14T22:16:57.774Z" },
+    { url = "https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash =
"sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019, upload-time = "2025-09-14T22:16:59.302Z" }, + { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012, upload-time = "2025-09-14T22:17:01.156Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148, upload-time = "2025-09-14T22:17:03.091Z" }, + { url = "https://files.pythonhosted.org/packages/e3/79/2bf870b3abeb5c070fe2d670a5a8d1057a8270f125ef7676d29ea900f496/zstandard-0.25.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:6a573a35693e03cf1d67799fd01b50ff578515a8aeadd4595d2a7fa9f3ec002a", size = 5451652, upload-time = "2025-09-14T22:17:04.979Z" }, + { url = "https://files.pythonhosted.org/packages/53/60/7be26e610767316c028a2cbedb9a3beabdbe33e2182c373f71a1c0b88f36/zstandard-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5a56ba0db2d244117ed744dfa8f6f5b366e14148e00de44723413b2f3938a902", size = 5546993, upload-time = "2025-09-14T22:17:06.781Z" }, + { url = "https://files.pythonhosted.org/packages/85/c7/3483ad9ff0662623f3648479b0380d2de5510abf00990468c286c6b04017/zstandard-0.25.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:10ef2a79ab8e2974e2075fb984e5b9806c64134810fac21576f0668e7ea19f8f", size = 5046806, upload-time = "2025-09-14T22:17:08.415Z" }, + { url = "https://files.pythonhosted.org/packages/08/b3/206883dd25b8d1591a1caa44b54c2aad84badccf2f1de9e2d60a446f9a25/zstandard-0.25.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aaf21ba8fb76d102b696781bddaa0954b782536446083ae3fdaa6f16b25a1c4b", size = 5576659, upload-time = "2025-09-14T22:17:10.164Z" }, + { url = "https://files.pythonhosted.org/packages/9d/31/76c0779101453e6c117b0ff22565865c54f48f8bd807df2b00c2c404b8e0/zstandard-0.25.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1869da9571d5e94a85a5e8d57e4e8807b175c9e4a6294e3b66fa4efb074d90f6", size = 4953933, upload-time = "2025-09-14T22:17:11.857Z" }, + { url = "https://files.pythonhosted.org/packages/18/e1/97680c664a1bf9a247a280a053d98e251424af51f1b196c6d52f117c9720/zstandard-0.25.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:809c5bcb2c67cd0ed81e9229d227d4ca28f82d0f778fc5fea624a9def3963f91", size = 5268008, upload-time = "2025-09-14T22:17:13.627Z" }, + { url = "https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517, upload-time = "2025-09-14T22:17:16.103Z" }, + { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292, upload-time = "2025-09-14T22:17:17.827Z" }, + { url = 
"https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237, upload-time = "2025-09-14T22:17:19.954Z" }, + { url = "https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922, upload-time = "2025-09-14T22:17:24.398Z" }, + { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276, upload-time = "2025-09-14T22:17:21.429Z" }, + { url = "https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679, upload-time = "2025-09-14T22:17:23.147Z" }, + { url = "https://files.pythonhosted.org/packages/35/0b/8df9c4ad06af91d39e94fa96cc010a24ac4ef1378d3efab9223cc8593d40/zstandard-0.25.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec996f12524f88e151c339688c3897194821d7f03081ab35d31d1e12ec975e94", size = 795735, upload-time = "2025-09-14T22:17:26.042Z" }, + { url = "https://files.pythonhosted.org/packages/3f/06/9ae96a3e5dcfd119377ba33d4c42a7d89da1efabd5cb3e366b156c45ff4d/zstandard-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a1a4ae2dec3993a32247995bdfe367fc3266da832d82f8438c8570f989753de1", size = 640440, upload-time = "2025-09-14T22:17:27.366Z" }, + { url = "https://files.pythonhosted.org/packages/d9/14/933d27204c2bd404229c69f445862454dcc101cd69ef8c6068f15aaec12c/zstandard-0.25.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:e96594a5537722fdfb79951672a2a63aec5ebfb823e7560586f7484819f2a08f", size = 5343070, upload-time = "2025-09-14T22:17:28.896Z" }, + { url = "https://files.pythonhosted.org/packages/6d/db/ddb11011826ed7db9d0e485d13df79b58586bfdec56e5c84a928a9a78c1c/zstandard-0.25.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bfc4e20784722098822e3eee42b8e576b379ed72cca4a7cb856ae733e62192ea", size = 5063001, upload-time = "2025-09-14T22:17:31.044Z" }, + { url = "https://files.pythonhosted.org/packages/db/00/87466ea3f99599d02a5238498b87bf84a6348290c19571051839ca943777/zstandard-0.25.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:457ed498fc58cdc12fc48f7950e02740d4f7ae9493dd4ab2168a47c93c31298e", size = 5394120, upload-time = "2025-09-14T22:17:32.711Z" }, + { url = "https://files.pythonhosted.org/packages/2b/95/fc5531d9c618a679a20ff6c29e2b3ef1d1f4ad66c5e161ae6ff847d102a9/zstandard-0.25.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:fd7a5004eb1980d3cefe26b2685bcb0b17989901a70a1040d1ac86f1d898c551", size = 5451230, upload-time = "2025-09-14T22:17:34.41Z" }, + { url = "https://files.pythonhosted.org/packages/63/4b/e3678b4e776db00f9f7b2fe58e547e8928ef32727d7a1ff01dea010f3f13/zstandard-0.25.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8e735494da3db08694d26480f1493ad2cf86e99bdd53e8e9771b2752a5c0246a", size = 5547173, upload-time = 
"2025-09-14T22:17:36.084Z" }, + { url = "https://files.pythonhosted.org/packages/4e/d5/ba05ed95c6b8ec30bd468dfeab20589f2cf709b5c940483e31d991f2ca58/zstandard-0.25.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3a39c94ad7866160a4a46d772e43311a743c316942037671beb264e395bdd611", size = 5046736, upload-time = "2025-09-14T22:17:37.891Z" }, + { url = "https://files.pythonhosted.org/packages/50/d5/870aa06b3a76c73eced65c044b92286a3c4e00554005ff51962deef28e28/zstandard-0.25.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:172de1f06947577d3a3005416977cce6168f2261284c02080e7ad0185faeced3", size = 5576368, upload-time = "2025-09-14T22:17:40.206Z" }, + { url = "https://files.pythonhosted.org/packages/5d/35/398dc2ffc89d304d59bc12f0fdd931b4ce455bddf7038a0a67733a25f550/zstandard-0.25.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3c83b0188c852a47cd13ef3bf9209fb0a77fa5374958b8c53aaa699398c6bd7b", size = 4954022, upload-time = "2025-09-14T22:17:41.879Z" }, + { url = "https://files.pythonhosted.org/packages/9a/5c/36ba1e5507d56d2213202ec2b05e8541734af5f2ce378c5d1ceaf4d88dc4/zstandard-0.25.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1673b7199bbe763365b81a4f3252b8e80f44c9e323fc42940dc8843bfeaf9851", size = 5267889, upload-time = "2025-09-14T22:17:43.577Z" }, + { url = "https://files.pythonhosted.org/packages/70/e8/2ec6b6fb7358b2ec0113ae202647ca7c0e9d15b61c005ae5225ad0995df5/zstandard-0.25.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0be7622c37c183406f3dbf0cba104118eb16a4ea7359eeb5752f0794882fc250", size = 5433952, upload-time = "2025-09-14T22:17:45.271Z" }, + { url = "https://files.pythonhosted.org/packages/7b/01/b5f4d4dbc59ef193e870495c6f1275f5b2928e01ff5a81fecb22a06e22fb/zstandard-0.25.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:5f5e4c2a23ca271c218ac025bd7d635597048b366d6f31f420aaeb715239fc98", size = 5814054, upload-time = "2025-09-14T22:17:47.08Z" }, + { url = "https://files.pythonhosted.org/packages/b2/e5/fbd822d5c6f427cf158316d012c5a12f233473c2f9c5fe5ab1ae5d21f3d8/zstandard-0.25.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f187a0bb61b35119d1926aee039524d1f93aaf38a9916b8c4b78ac8514a0aaf", size = 5360113, upload-time = "2025-09-14T22:17:48.893Z" }, + { url = "https://files.pythonhosted.org/packages/8e/e0/69a553d2047f9a2c7347caa225bb3a63b6d7704ad74610cb7823baa08ed7/zstandard-0.25.0-cp313-cp313-win32.whl", hash = "sha256:7030defa83eef3e51ff26f0b7bfb229f0204b66fe18e04359ce3474ac33cbc09", size = 436936, upload-time = "2025-09-14T22:17:52.658Z" }, + { url = "https://files.pythonhosted.org/packages/d9/82/b9c06c870f3bd8767c201f1edbdf9e8dc34be5b0fbc5682c4f80fe948475/zstandard-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:1f830a0dac88719af0ae43b8b2d6aef487d437036468ef3c2ea59c51f9d55fd5", size = 506232, upload-time = "2025-09-14T22:17:50.402Z" }, + { url = "https://files.pythonhosted.org/packages/d4/57/60c3c01243bb81d381c9916e2a6d9e149ab8627c0c7d7abb2d73384b3c0c/zstandard-0.25.0-cp313-cp313-win_arm64.whl", hash = "sha256:85304a43f4d513f5464ceb938aa02c1e78c2943b29f44a750b48b25ac999a049", size = 462671, upload-time = "2025-09-14T22:17:51.533Z" }, + { url = "https://files.pythonhosted.org/packages/3d/5c/f8923b595b55fe49e30612987ad8bf053aef555c14f05bb659dd5dbe3e8a/zstandard-0.25.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e29f0cf06974c899b2c188ef7f783607dbef36da4c242eb6c82dcd8b512855e3", size = 795887, upload-time = "2025-09-14T22:17:54.198Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/09/d0a2a14fc3439c5f874042dca72a79c70a532090b7ba0003be73fee37ae2/zstandard-0.25.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:05df5136bc5a011f33cd25bc9f506e7426c0c9b3f9954f056831ce68f3b6689f", size = 640658, upload-time = "2025-09-14T22:17:55.423Z" }, + { url = "https://files.pythonhosted.org/packages/5d/7c/8b6b71b1ddd517f68ffb55e10834388d4f793c49c6b83effaaa05785b0b4/zstandard-0.25.0-cp314-cp314-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:f604efd28f239cc21b3adb53eb061e2a205dc164be408e553b41ba2ffe0ca15c", size = 5379849, upload-time = "2025-09-14T22:17:57.372Z" }, + { url = "https://files.pythonhosted.org/packages/a4/86/a48e56320d0a17189ab7a42645387334fba2200e904ee47fc5a26c1fd8ca/zstandard-0.25.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223415140608d0f0da010499eaa8ccdb9af210a543fac54bce15babbcfc78439", size = 5058095, upload-time = "2025-09-14T22:17:59.498Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ad/eb659984ee2c0a779f9d06dbfe45e2dc39d99ff40a319895df2d3d9a48e5/zstandard-0.25.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e54296a283f3ab5a26fc9b8b5d4978ea0532f37b231644f367aa588930aa043", size = 5551751, upload-time = "2025-09-14T22:18:01.618Z" }, + { url = "https://files.pythonhosted.org/packages/61/b3/b637faea43677eb7bd42ab204dfb7053bd5c4582bfe6b1baefa80ac0c47b/zstandard-0.25.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ca54090275939dc8ec5dea2d2afb400e0f83444b2fc24e07df7fdef677110859", size = 6364818, upload-time = "2025-09-14T22:18:03.769Z" }, + { url = "https://files.pythonhosted.org/packages/31/dc/cc50210e11e465c975462439a492516a73300ab8caa8f5e0902544fd748b/zstandard-0.25.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e09bb6252b6476d8d56100e8147b803befa9a12cea144bbe629dd508800d1ad0", size = 5560402, upload-time = "2025-09-14T22:18:05.954Z" }, + { url = "https://files.pythonhosted.org/packages/c9/ae/56523ae9c142f0c08efd5e868a6da613ae76614eca1305259c3bf6a0ed43/zstandard-0.25.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a9ec8c642d1ec73287ae3e726792dd86c96f5681eb8df274a757bf62b750eae7", size = 4955108, upload-time = "2025-09-14T22:18:07.68Z" }, + { url = "https://files.pythonhosted.org/packages/98/cf/c899f2d6df0840d5e384cf4c4121458c72802e8bda19691f3b16619f51e9/zstandard-0.25.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:a4089a10e598eae6393756b036e0f419e8c1d60f44a831520f9af41c14216cf2", size = 5269248, upload-time = "2025-09-14T22:18:09.753Z" }, + { url = "https://files.pythonhosted.org/packages/1b/c0/59e912a531d91e1c192d3085fc0f6fb2852753c301a812d856d857ea03c6/zstandard-0.25.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f67e8f1a324a900e75b5e28ffb152bcac9fbed1cc7b43f99cd90f395c4375344", size = 5430330, upload-time = "2025-09-14T22:18:11.966Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/7e31db1240de2df22a58e2ea9a93fc6e38cc29353e660c0272b6735d6669/zstandard-0.25.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:9654dbc012d8b06fc3d19cc825af3f7bf8ae242226df5f83936cb39f5fdc846c", size = 5811123, upload-time = "2025-09-14T22:18:13.907Z" }, + { url = "https://files.pythonhosted.org/packages/f6/49/fac46df5ad353d50535e118d6983069df68ca5908d4d65b8c466150a4ff1/zstandard-0.25.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:4203ce3b31aec23012d3a4cf4a2ed64d12fea5269c49aed5e4c3611b938e4088", size = 5359591, upload-time = "2025-09-14T22:18:16.465Z" }, + { url = "https://files.pythonhosted.org/packages/c2/38/f249a2050ad1eea0bb364046153942e34abba95dd5520af199aed86fbb49/zstandard-0.25.0-cp314-cp314-win32.whl", hash = "sha256:da469dc041701583e34de852d8634703550348d5822e66a0c827d39b05365b12", size = 444513, upload-time = "2025-09-14T22:18:20.61Z" }, + { url = "https://files.pythonhosted.org/packages/3a/43/241f9615bcf8ba8903b3f0432da069e857fc4fd1783bd26183db53c4804b/zstandard-0.25.0-cp314-cp314-win_amd64.whl", hash = "sha256:c19bcdd826e95671065f8692b5a4aa95c52dc7a02a4c5a0cac46deb879a017a2", size = 516118, upload-time = "2025-09-14T22:18:17.849Z" }, + { url = "https://files.pythonhosted.org/packages/f0/ef/da163ce2450ed4febf6467d77ccb4cd52c4c30ab45624bad26ca0a27260c/zstandard-0.25.0-cp314-cp314-win_arm64.whl", hash = "sha256:d7541afd73985c630bafcd6338d2518ae96060075f9463d7dc14cfb33514383d", size = 476940, upload-time = "2025-09-14T22:18:19.088Z" }, +]