From d3dfe3aa280ecd2ab9a21992f09941575c70f202 Mon Sep 17 00:00:00 2001
From: John-Ge <1017457635@qq.com>
Date: Mon, 28 Aug 2023 11:04:08 +0000
Subject: [PATCH 1/2] addtoken

---
Review notes: count_tokenizer_local now counts tokens instead of characters,
the CLI dispatches to it when --use-tiktoken is not given, and the count is
printed for both backends. The commit and blob ids in this series predate
these revisions.

 tools/count.py | 62 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 12 deletions(-)

diff --git a/tools/count.py b/tools/count.py
index 1d5538c..3fa6b5a 100644
--- a/tools/count.py
+++ b/tools/count.py
@@ -1,18 +1,56 @@
+"""Count the tokens in the conversation turns of a chat-format JSON file."""
+import argparse
 import json
-from transformers import AutoTokenizer
 
-with open("/path/to/your/json", 'r') as f:
-    data = json.load(f)
+import tiktoken
+from transformers import AutoTokenizer
 
-tokenizer = AutoTokenizer.from_pretrained("/path/to/your/tokenizer/")
 
-count = 0
+def _iter_values(json_path):
+    """Yield the "value" string of every conversation turn in json_path.
+
+    The file must hold a list of chats, each with a "conversations" list
+    whose items carry a "value" string.
+    """
+    with open(json_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
 
-for chat in data:
-    conversations = chat["conversations"]
-    for conv in conversations:
-        value = conv["value"]
-        tokenize = tokenizer.tokenize(value)
-        count += len(value)
+    for chat in data:
+        for conv in chat["conversations"]:
+            yield conv["value"]
+
+
+def count_tokenizer(json_path, model_name):
+    """Return the token count under the tiktoken encoding for model_name."""
+    encoding = tiktoken.encoding_for_model(model_name)
+    # disallowed_special=() makes text that looks like a special token
+    # (e.g. "<|endoftext|>") encode as plain text instead of raising.
+    return sum(
+        len(encoding.encode(value, disallowed_special=()))
+        for value in _iter_values(json_path)
+    )
+
+
+def count_tokenizer_local(json_path, tokenizer_path):
+    """Return the token count under the tokenizer at tokenizer_path."""
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    # Count the tokens produced, not the characters of the raw string.
+    return sum(
+        len(tokenizer.tokenize(value))
+        for value in _iter_values(json_path)
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Count the number of tokens in a JSON file using a tokenizer.')
+    parser.add_argument('json_path', type=str, help='Path to the JSON file.')
+    parser.add_argument('tokenizer_path', type=str, help='Path to the tokenizer directory or tiktoken name.')
+    parser.add_argument('--use-tiktoken', action='store_true', help='Use tiktoken to count tokens.')
+    args = parser.parse_args()
+
+    if args.use_tiktoken:
+        count = count_tokenizer(args.json_path, args.tokenizer_path)
+    else:
+        count = count_tokenizer_local(args.json_path, args.tokenizer_path)
 
-print(count)
+    print(count)

From 7fe1045686f9b64374b9c7152a1d1d2c2d277854 Mon Sep 17 00:00:00 2001
From: John-Ge <1017457635@qq.com>
Date: Mon, 28 Aug 2023 11:12:40 +0000
Subject: [PATCH 2/2] addcount

---
 {tools => src/tools}/count.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {tools => src/tools}/count.py (100%)

diff --git a/tools/count.py b/src/tools/count.py
similarity index 100%
rename from tools/count.py
rename to src/tools/count.py