From f872f5cdf6d81c49337bfcd25b3b5419d2090497 Mon Sep 17 00:00:00 2001 From: yych42 <43839115+yych42@users.noreply.github.com> Date: Tue, 21 May 2024 12:08:54 +0000 Subject: [PATCH] Update OpenAI implementation and models --- src/semantra/models.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/semantra/models.py b/src/semantra/models.py index 5ff77b1..40958dd 100644 --- a/src/semantra/models.py +++ b/src/semantra/models.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod import numpy as np -import openai +from openai import OpenAI import tiktoken import torch from dotenv import load_dotenv @@ -104,7 +104,7 @@ def is_asymmetric(self): class OpenAIModel(BaseModel): def __init__( self, - model_name="text-embedding-ada-002", + model_name="text-embedding-3-small", num_dimensions=1536, tokenizer_name="cl100k_base", ): @@ -113,16 +113,15 @@ def __init__( raise Exception( "OpenAI API key not set. Please set the OPENAI_API_KEY environment variable or create a `.env` file with the key in the current working directory or the Semantra directory, which is revealed by running `semantra --show-semantra-dir`." ) - - openai.api_key = os.getenv("OPENAI_API_KEY") self.model_name = model_name self.num_dimensions = num_dimensions self.tokenizer = tiktoken.get_encoding(tokenizer_name) + self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def get_config(self): return { - "model_type": "openai", + "model_type": "openai" if self.model_name == "text-embedding-3-small" else "openai-large", "model_name": self.model_name, "tokenizer_name": self.tokenizer.name, } @@ -141,8 +140,8 @@ def get_text_chunks(self, _: str, tokens) -> "list[str]": def embed(self, tokens, offsets, _is_query=False) -> "list[list[float]]": texts = [tokens[i:j] for i, j in offsets] - response = openai.Embedding.create(model=self.model_name, input=texts) - return np.array([data["embedding"] for data in response["data"]]) + response = self.client.embeddings.create(model=self.model_name, input=texts) + return np.array([data.embedding for data in response.data]) def zero_if_none(x): @@ -314,15 +313,25 @@ def embed(self, tokens, offsets, is_query=False) -> "list[list[float]]": models = { "openai": { - "cost_per_token": 0.0004 / 1000, + "cost_per_token": 0.00002 / 1000, "pool_size": 50000, "pool_count": 2000, "get_model": lambda: OpenAIModel( - model_name="text-embedding-ada-002", + model_name="text-embedding-3-small", num_dimensions=1536, tokenizer_name="cl100k_base", ), }, + "openai-large": { + "cost_per_token": 0.00013 / 1000, + "pool_size": 50000, + "pool_count": 2000, + "get_model": lambda: OpenAIModel( + model_name="text-embedding-3-large", + num_dimensions=3072, + tokenizer_name="cl100k_base", + ), + }, "minilm": { "cost_per_token": None, "pool_size": 50000,