transcriber/transcript_processor.py at main · vtsachev/transcriber · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# transcript_processor.py

import logging
import tiktoken

def process_transcript(client, transcript_text, output_file_name):
    """
    Processes the combined transcript with GPT-4o-mini in manageable chunks and saves the final transcript.

    Parameters:
    - client: OpenAI client object
    - transcript_text: The combined transcript text.
    - output_file_name: The filename to save the final transcript.
    """
    try:
        # Your specified prompt
        prompt = """
Your task is to improve the readability of a raw transcript while preserving as much of the original wording as possible. Follow these guidelines:

1. Read the transcript carefully, sentence by sentence.
2. Identify speakers:
   - There are multiple potential speakers in this transcript.
   - Identify transitions between speakers based on:
     a) Clear changes in the topic or subject matter
     b) Shifts in speaking style, vocabulary, or tone
     c) Contextual clues indicating a new person is speaking (e.g., "Thank you for that presentation, now I have a question...")
   - Assign content to speakers as follows:
     a) For consecutive content from the same speaker, group it together.
     b) When you detect a likely speaker change, start a new paragraph and label it with a new speaker designation.
   - Label speakers sequentially as "Speaker 1", "Speaker 2", "Speaker 3", etc., in the order they appear in the transcript.
   - If a speaker returns later in the transcript, reuse their original label.
   - If you're unsure about a speaker change, err on the side of keeping the current speaker label.

3. Improve readability with minimal changes:
   - Correct only obvious grammatical errors and typos.
   - Retain original wording unless it significantly impairs understanding.
   - Keep filler words and false starts unless they severely disrupt readability.
   - Only break up extremely long, confusing sentences if necessary for comprehension.
   - Maintain the original tone, style, and vocabulary of each speaker.

4. Handle unclear or nonsensical parts:
   - If a part is completely incomprehensible, replace it with [best guess].
   - Inside the brackets, provide your best interpretation of the intended meaning.
   - Use a similar number of words and try to incorporate any recognizable parts of the original text.

5. Formatting:
   - Present the transcript in a clear, easy-to-read format.
   - Start a new paragraph for each change of speaker.

6. Prioritize authenticity:
   - Your primary goal is to preserve the original content as much as possible.
   - Make only the minimum changes necessary for basic comprehension.
   - Do not add new information, elaborate, or change the context of the conversation.
   - If in doubt, keep the original wording.

Provide the minimally-edited, more readable version of the transcript while strictly adhering to these guidelines. The output should be nearly identical to the input, with changes only where absolutely necessary for basic understanding.
"""

        # Maximum tokens for GPT-4o-mini
        MAX_CONTEXT_LENGTH = 128000
        MAX_COMPLETION_TOKENS = 16384  # Model's maximum output tokens
        MAX_INPUT_TOKENS = MAX_CONTEXT_LENGTH - MAX_COMPLETION_TOKENS

        # Initialize the tiktoken encoder
        encoding = tiktoken.encoding_for_model("gpt-4o-mini")

        # Tokenize the prompt and system message
        prompt_tokens = len(encoding.encode(prompt))
        system_message_tokens = len(encoding.encode("You are a helpful assistant."))

        # Calculate available tokens for the transcript
        available_tokens = MAX_INPUT_TOKENS - prompt_tokens - system_message_tokens - 100  # Reserve some buffer

        # Split the transcript into chunks that fit within the available tokens
        transcript_chunks = split_transcript_into_chunks(transcript_text, available_tokens, encoding)

        # Process each chunk
        final_transcript = ""
        chunk_number = 1

        for chunk in transcript_chunks:
            logging.info(f"Processing chunk {chunk_number}/{len(transcript_chunks)}")

            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt + "\n\n" + chunk}
            ]

            # Estimate the total tokens for this request
            total_request_tokens = prompt_tokens + system_message_tokens + len(encoding.encode(chunk))

            # Adjust max_tokens to prevent exceeding the context window
            max_tokens_for_completion = min(MAX_COMPLETION_TOKENS, MAX_CONTEXT_LENGTH - total_request_tokens - 100)

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                max_tokens=max_tokens_for_completion,
                temperature=0.5
            )

            # Extract the assistant's reply
            assistant_reply = response.choices[0].message.content

            final_transcript += assistant_reply + "\n"

            chunk_number += 1

        # Save the final transcript
        with open(output_file_name, "w", encoding="utf-8") as f:
            f.write(final_transcript)
        logging.info(f"Final transcript saved to '{output_file_name}'")

    except Exception as e:
        logging.exception("An unexpected error occurred during processing with GPT-4o-mini.")
        raise

def split_transcript_into_chunks(transcript_text, max_tokens_per_chunk, encoding):
    """
    Splits the transcript text into chunks that do not exceed the max tokens per chunk.

    Parameters:
    - transcript_text: The combined transcript text.
    - max_tokens_per_chunk: Maximum number of tokens allowed per chunk.
    - encoding: The tiktoken encoding object.

    Returns:
    - List of transcript text chunks.
    """
    # Split the transcript into sentences or paragraphs
    import re
    sentences = re.split(r'(?<=[.!?])\s+', transcript_text)

    chunks = []
    current_chunk = ""
    current_tokens = 0

    for sentence in sentences:
        sentence_tokens = len(encoding.encode(sentence))

        if current_tokens + sentence_tokens <= max_tokens_per_chunk:
            current_chunk += sentence + " "
            current_tokens += sentence_tokens
        else:
            # Start a new chunk
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
            current_tokens = sentence_tokens

            # Handle sentences longer than max_tokens_per_chunk
            if sentence_tokens > max_tokens_per_chunk:
                logging.warning(f"A single sentence exceeds the maximum tokens per chunk: '{sentence[:50]}...'")
                # You may choose to split the sentence further or handle it accordingly

    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks