transcribrr/summarize-transcript.sh at main · nostrocket/transcribrr · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
#!/bin/bash

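# Summarize a video transcript using Qwen 2.5 with MLX (Apple Silicon GPU)
# Usage: ./summarize-transcript.sh [--model <label|org/repo>] [--style <style>] <transcript_file>
#        ./summarize-transcript.sh --install    # install deps and pre-download the default model
#
# Requires: Apple Silicon Mac, ~20 GB RAM for 32B 4-bit model
# Dependencies are installed automatically into .venv on first run.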

set -euo pipefail

TRANSCRIPT_FILE=""
INSTALL_ONLY=false
MODEL_FLAG=""
STYLE_FLAG=""

# Parse command-line arguments into the globals declared above.
#   --install        set INSTALL_ONLY=true
#   --model <value>  stored in MODEL_FLAG (validated later in the script)
#   --style <value>  stored in STYLE_FLAG (validated later in the script)
#   <path>           stored in TRANSCRIPT_FILE; if repeated, the last one wins
# Any other argument starting with "-" is rejected with exit status 1.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --install)
                INSTALL_ONLY=true
                shift
                ;;
            --model|--style)
                # A trailing "--model"/"--style" would otherwise read an unset
                # $2 and die under `set -u` with "unbound variable".
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires an argument" >&2
                    exit 1
                fi
                if [[ "$1" == "--model" ]]; then
                    MODEL_FLAG="$2"
                else
                    STYLE_FLAG="$2"
                fi
                shift 2
                ;;
            -*)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
            *)
                TRANSCRIPT_FILE="$1"
                shift
                ;;
        esac
    done
}

parse_args "$@"

# Absolute directory containing this script; the virtual environment and the
# interpreter/pip paths used below are all derived from it.
SCRIPT_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
VENV_DIR="${SCRIPT_DIR}/.venv"
PYTHON="${VENV_DIR}/bin/python"
PIP="${VENV_DIR}/bin/pip"

# ── Ensure virtual environment and dependencies ──────────────────────────────

# Ensure the virtual environment exists and that mlx-lm is importable in it.
# Globals (read): VENV_DIR, PYTHON, PIP
# Outputs:        progress messages on stdout
# Under the script's `set -e`, a failing venv creation or pip install aborts
# the run.
setup_venv() {
    # Check for a usable interpreter, not just the directory: an interrupted
    # or failed `python3 -m venv` leaves $VENV_DIR behind, and a
    # directory-only check would then skip creation on every later run.
    if [ ! -x "$PYTHON" ]; then
        echo "Creating virtual environment at $VENV_DIR ..."
        python3 -m venv "$VENV_DIR"
    fi

    # Check if mlx-lm is installed (import errors are silenced on purpose;
    # a failed import is the signal to install).
    if ! "$PYTHON" -c "import mlx_lm" 2>/dev/null; then
        echo "Installing mlx-lm (MLX language model framework)..."
        "$PIP" install --upgrade pip > /dev/null
        "$PIP" install mlx-lm
        echo ""
        echo "mlx-lm installed successfully."
    fi
}

setup_venv

# --install mode: load the default model once through mlx_lm (the messages
# below describe this as a pre-download), then stop without summarizing.
if [[ "$INSTALL_ONLY" == true ]]; then
    printf '\n'
    printf '%s\n' "Dependencies installed. Pre-downloading default model..."
    "$PYTHON" -c "
from mlx_lm import load
print('Downloading Qwen2.5-32B-Instruct-4bit...')
model, tokenizer = load('mlx-community/Qwen2.5-32B-Instruct-4bit')
print('Model downloaded and ready.')
"
    printf 'Done! Run: %s <transcript_file>\n' "$0"
    exit 0
fi

# ── Validate input ───────────────────────────────────────────────────────────

# Require an existing transcript file. On failure, write diagnostics and the
# usage text to stderr and exit 1.
if [ -z "$TRANSCRIPT_FILE" ] || [ ! -f "$TRANSCRIPT_FILE" ]; then
    {
        # Distinguish "bad path" from "no path": showing only the generic
        # usage text for a mistyped filename hides the actual problem.
        if [ -n "$TRANSCRIPT_FILE" ]; then
            echo "Error: transcript file not found: $TRANSCRIPT_FILE"
            echo ""
        fi
        echo "Usage: $0 [--model <model>] [--style <style>] <transcript_file>"
        echo "       $0 --install            # Install deps and download model"
        echo ""
        echo "Options:"
        echo "  --model <model>  Qwen2.5-7B-4bit | Qwen2.5-14B-4bit | Qwen2.5-32B-4bit (default) |"
        echo "                   Qwen2.5-32B-8bit, or a full <org>/<repo> model path"
        echo "  --style <style>  executive | detailed | bullets | chapters | blog (default)"
        echo ""
        echo "Example: $0 recording_transcript.txt"
        echo ""
        echo "Summarizes a video/audio transcript using Qwen 2.5 on Apple Silicon."
        echo "Supports transcripts of any length via intelligent chunking."
    } >&2
    exit 1
fi

# ── Model selection ──────────────────────────────────────────────────────────

# Resolve MODEL (the model path passed to the Python stage) and MODEL_LABEL
# (embedded in the output filename) from the --model value:
#   empty          -> default model
#   contains "/"   -> used verbatim as MODEL; label is a sanitized form of it
#   otherwise      -> must be one of the known short labels
if [[ -z "$MODEL_FLAG" ]]; then
    # Default: Qwen2.5-32B-4bit (README-recommended)
    MODEL="mlx-community/Qwen2.5-32B-Instruct-4bit"
    MODEL_LABEL="Qwen2.5-32B-4bit"
elif [[ "$MODEL_FLAG" == */* ]]; then
    MODEL="$MODEL_FLAG"
    # Apply same sanitization as cleanup-transcript.sh (fixes unsanitized label bug):
    # drop the first "mlx-community/", replace anything outside [a-zA-Z0-9.-]
    # with "_", then lowercase.
    MODEL_LABEL=$(
        printf '%s\n' "$MODEL" \
            | sed -e 's/mlx-community\///' -e 's/[^a-zA-Z0-9.-]/_/g' \
            | tr '[:upper:]' '[:lower:]'
    )
else
    case "$MODEL_FLAG" in
        Qwen2.5-7B-4bit | Qwen2.5-14B-4bit | Qwen2.5-32B-4bit | Qwen2.5-32B-8bit)
            MODEL_LABEL="$MODEL_FLAG"
            # Label "Qwen2.5-<size>-<quant>" maps to
            # "mlx-community/Qwen2.5-<size>-Instruct-<quant>".
            MODEL="mlx-community/${MODEL_FLAG%-*}-Instruct-${MODEL_FLAG##*-}"
            ;;
        *)
            echo "Error: Unknown summary model '$MODEL_FLAG'. Valid labels: Qwen2.5-7B-4bit Qwen2.5-14B-4bit Qwen2.5-32B-4bit Qwen2.5-32B-8bit" >&2
            exit 1
            ;;
    esac
fi

# ── Summary style selection ──────────────────────────────────────────────────

# Pick the summary style: the --style value if given, otherwise the default.
# Default: blog (README-recommended)
STYLE="${STYLE_FLAG:-blog}"
case "$STYLE" in
    executive | detailed | bullets | chapters | blog)
        ;;
    *)
        echo "Error: Unknown style '$STYLE_FLAG'. Valid styles: executive detailed bullets chapters blog" >&2
        exit 1
        ;;
esac

# Print PATH with the final ".ext" removed from its filename component only.
# A plain ${path%.*} also matches a dot in a directory name when the file has
# no extension ("./rec/talk" -> "", "my.dir/talk" -> "my"), which would send
# the summary to the wrong place. Leading-dot names (".notes") are kept whole.
# Arguments: $1 - file path
# Outputs:   the stripped path on stdout
strip_extension() {
    local path="$1"
    local dir=""
    local name="$path"
    if [[ "$path" == */* ]]; then
        dir="${path%/*}/"
        name="${path##*/}"
    fi
    case "$name" in
        ?*.*) name="${name%.*}" ;;
    esac
    printf '%s\n' "${dir}${name}"
}

# The summary is written next to the transcript:
#   <transcript without extension>_summary_<model label>_<style>.md
BASENAME="$(strip_extension "$TRANSCRIPT_FILE")"
OUTPUT_FILE="${BASENAME}_summary_${MODEL_LABEL}_${STYLE}.md"

echo ""
echo "Model:   $MODEL"
echo "Style:   $STYLE"
echo "Input:   $TRANSCRIPT_FILE"
echo "Output:  $OUTPUT_FILE"
echo ""

# ── Run summarization via Python ─────────────────────────────────────────────

TRANSCRIPT_FILE="$TRANSCRIPT_FILE" OUTPUT_FILE="$OUTPUT_FILE" MODEL="$MODEL" MODEL_LABEL="$MODEL_LABEL" STYLE="$STYLE" "$PYTHON" << 'PYTHON_SCRIPT'
import sys
import os
import time

# Inputs are handed over by the shell wrapper through environment variables;
# a missing variable raises KeyError.
transcript_file, output_file, model_name, model_label, style = (
    os.environ[key]
    for key in ('TRANSCRIPT_FILE', 'OUTPUT_FILE', 'MODEL', 'MODEL_LABEL', 'STYLE')
)

# ── Read transcript ──────────────────────────────────────────────────────────

# Decode explicitly instead of relying on the process locale. 'utf-8-sig'
# also drops a leading BOM, which would otherwise stop the first header line
# from matching 'Model:'. Undecodable bytes become U+FFFD rather than
# aborting the run.
with open(transcript_file, 'r', encoding='utf-8-sig', errors='replace') as f:
    lines = f.readlines()

# Strip metadata header (Model:, Source:, Date: lines from transcribe.sh).
# Header lines and blank lines are consumed until the first other line; from
# there on every line is kept as transcript content.
HEADER_FIELDS = {
    'Model:': 'transcription_model',
    'Source:': 'source',
    'Date:': 'date',
}
content_lines = []
header_done = False
metadata = {}
for line in lines:
    if not header_done:
        if line.strip() == '':
            continue
        field = next(
            (name for prefix, name in HEADER_FIELDS.items() if line.startswith(prefix)),
            None,
        )
        if field is not None:
            metadata[field] = line.split(':', 1)[1].strip()
            continue
        header_done = True
    content_lines.append(line)

transcript = ''.join(content_lines).strip()
if not transcript:
    # Nothing but header/blank lines: fall back to the raw file contents.
    transcript = ''.join(lines).strip()

words = transcript.split()
total_words = len(words)
print(f"Transcript: {total_words:,} words")

# ── Build prompts per style ──────────────────────────────────────────────────

# Maps each supported summary style to the instruction text that is placed at
# the start of the user prompt. The keys are the same style names the shell
# wrapper accepts for --style; "blog" is the wrapper's default.
# The values are sent to the model verbatim, so any edit here changes the
# generated output.
STYLE_PROMPTS = {
    "executive": """Write a concise executive summary of this video transcript.

Format:
- Start with a 1-2 sentence overview of what the video is about
- Write 1-2 paragraphs covering the main arguments and conclusions
- End with a "Key Takeaways" section listing 3-5 bullet points
- Keep the total summary under 500 words""",

    "detailed": """Write a detailed, structured summary of this video transcript.

Format:
- Start with a brief overview paragraph
- Create logical sections with clear headings (use ## for headings)
- Under each section, use bullet points for key arguments and details
- Include important quotes or statistics mentioned (use > blockquotes)
- End with a "Key Takeaways" section
- Aim for a thorough but readable summary""",

    "bullets": """Summarize this video transcript as a concise bullet-point list.

Format:
- Group related points under topic headings (use ## for headings)
- Use clear, complete sentences for each bullet
- Include specific claims, numbers, or examples mentioned
- Aim for 15-30 bullet points total
- No prose paragraphs, bullets only""",

    "chapters": """Create a chapter-by-chapter breakdown of this video transcript.

Format:
- Identify natural topic transitions and create chapters
- For each chapter, provide:
  - A descriptive chapter title (## heading)
  - 2-3 sentence summary of that section
  - Key points as bullet list
- Number the chapters sequentially
- Note when the speaker transitions between topics""",

    "blog": """Transform this video transcript into a compelling, engaging blog post that readers can't put down.

CRITICAL — ENGAGEMENT RULES (DocFlow framework):

1. OPENING HOOK (mandatory, first 150 words):
   - Start with a specific, intriguing question that the video answers
   - OR a "you" + problem statement ("Ever struggled with X?", "Tired of Y?")
   - OR a surprising statistic/claim from the video
   - OR an explicit outcome promise ("By the end, you'll understand...")
   - NEVER start with generic introductions like "In this video..." or "The speaker discusses..."

2. CURIOSITY & INFORMATION GAPS:
   - Use question-based headings (## Why Does X Happen? ## How Can You Avoid Y?)
   - Create micro-cliffhangers at section transitions ("But here's where it gets interesting...")
   - Pose each problem/question BEFORE giving the answer
   - Use concrete specifics, not abstractions

3. READER-CENTERED VOICE:
   - Use "you" and "your" frequently — address the reader directly
   - Active voice only (NEVER "it was explained that..." — use "the speaker explains...")
   - Conversational tone: contractions (you'll, here's, let's), questions, informal phrasing
   - High "you" to "the video/speaker" ratio (aim for 3:1)

4. CLARITY & SCANNABILITY:
   - Short sentences (max 25 words average)
   - Descriptive, keyword-rich headings (not "Overview" or "Details")
   - Break dense sections into subsections with ## or #### headings
   - Frontload key points in each paragraph (topic sentence first)

5. CONCRETE EXAMPLES:
   - Every major concept needs a concrete example with specific details
   - NEVER use placeholder names (foo, bar, example, test, sample)
   - Use domain-specific, real-world examples from the transcript
   - Include quotes from the speaker when they're vivid or specific

6. FLOW & MOMENTUM:
   - Use transition phrases between sections ("Now that you understand X...", "Let's see how...", "Here's where it gets interesting...")
   - End with "What's Next" or "Key Takeaways" section
   - Create narrative arc: setup problem → explore solutions → resolution/insights

7. FORBIDDEN LLM VOCABULARY (use natural alternatives):
   - NEVER: delve, tapestry, landscape, realm, embark, cornerstone, underpinning, pivotal, paramount, robust, meticulous
   - NEVER: "It's worth noting that", "It's important to understand", "At the end of the day"
   - AVOID excessive em dashes (—), use regular dashes (-) or commas
   - Write like a human expert, NOT a language model

8. STRUCTURE:
   - Opening hook paragraph (150 words)
   - Problem/context section with question-based heading
   - 3-5 main insight sections, each with:
     * Clear, specific heading
     * Topic sentence stating main point
     * Supporting details with examples/quotes
     * Transition to next section
   - "Key Takeaways" section (3-5 bullets, action-oriented)
   - "What's Next" or final thought

OUTPUT REQUIREMENTS:
- Clean Markdown with ## headings for major sections, #### for subsections
- No meta-commentary ("This blog post...", "The video covers...")
- Preserve all factual claims and insights from the transcript
- 800-1500 words (comprehensive but focused)
- Include 2-3 direct quotes from the speaker (use > blockquotes)
- Suggest where images/diagrams would enhance understanding [IMAGE: description]

Remember: Your goal is to make this content so engaging that readers can't stop reading. Use all the psychological triggers — curiosity gaps, reader focus, momentum, concrete examples — to create addictive documentation."""
}

# System message used for every model call in this script: the single-pass
# summary, each per-chunk summary, and the final synthesis. Sent verbatim.
SYSTEM_PROMPT = """You are an expert content writer specializing in transforming video transcripts into compelling written content.

Core Principles:
- Capture ALL major points and arguments — comprehensive coverage required
- Preserve specific claims, statistics, names, examples, and quotes exactly as stated
- Maintain the logical flow and structure of the original discussion
- Use clear, precise, human language (never robotic or formulaic)
- NEVER add opinions, interpretations, or information not in the transcript
- NEVER use LLM telltale words: delve, tapestry, landscape, realm, embark, cornerstone, robust, meticulous, pivotal, paramount
- NEVER use filler phrases: "It's worth noting", "At the end of the day", "It's important to understand"
- Write like a skilled human writer, not an AI assistant

Output Format:
- Clean, well-structured Markdown
- For blog posts: apply DocFlow engagement framework (see style instructions)
- For summaries: focus on clarity and comprehensiveness
- Always preserve the speaker's voice and key insights"""

# Instruction text for the requested style. The shell wrapper only accepts
# style names that are keys of STYLE_PROMPTS; any other value would raise
# KeyError here.
style_instruction = STYLE_PROMPTS[style]

# ── Chunking strategy ────────────────────────────────────────────────────────
# Qwen 2.5 32B: 32k token context window
# Budget: ~4k output + ~2k prompt = 6k overhead → 26k tokens available for input
# At ~1.3 tokens/word, safely handle up to 20k words in single pass

MAX_SINGLE_PASS_WORDS = 20000
CHUNK_SIZE = 18000  # If chunking needed, use large chunks
MAX_SUMMARY_TOKENS = 4096
SENTENCE_ENDINGS = ('.', '!', '?', '."', '?"', '!"')


def split_into_chunks(word_list, chunk_size=CHUNK_SIZE, hard_limit=MAX_SINGLE_PASS_WORDS):
    """Split a list of words into space-joined chunks.

    A chunk is closed at the first word ending a sentence once it holds at
    least ``chunk_size`` words. If no sentence end turns up, the chunk is
    closed anyway at ``hard_limit`` words: transcripts without punctuation
    would otherwise produce a single chunk larger than the single-pass budget.

    Returns a list of strings; empty input yields an empty list.
    """
    result = []
    current_chunk = []
    for word in word_list:
        current_chunk.append(word)
        size = len(current_chunk)
        at_sentence_end = word.endswith(SENTENCE_ENDINGS)
        if (size >= chunk_size and at_sentence_end) or size >= hard_limit:
            result.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        result.append(' '.join(current_chunk))
    return result


# Only chunk if transcript exceeds single-pass capacity
if total_words <= MAX_SINGLE_PASS_WORDS:
    # Single pass keeps the transcript's original whitespace and line breaks.
    chunks = [transcript]
    print(f"Transcript fits in single pass ({total_words:,} ≤ {MAX_SINGLE_PASS_WORDS:,} words)")
else:
    # Split at sentence boundaries for very long transcripts
    chunks = split_into_chunks(words)
    print(f"Transcript exceeds single-pass limit, using {len(chunks)} chunks")

print()

# ── Load model ───────────────────────────────────────────────────────────────

print(f"Loading model: {model_name}")
load_started = time.time()
# Imported inside the timed region, so the reported time also covers
# importing mlx_lm. `generate` is used later by run_llm.
from mlx_lm import load, generate

model, tokenizer = load(model_name)
print(f"Model loaded in {time.time() - load_started:.1f}s")
print()

# ── Generate summaries ───────────────────────────────────────────────────────

def run_llm(system, user, max_tokens=MAX_SUMMARY_TOKENS):
    """Run one chat-style generation and return the stripped response text.

    ``system`` and ``user`` become the system and user messages of a chat
    prompt rendered with the tokenizer's chat template. ``max_tokens`` is
    passed through to ``generate``. A one-line speed report is printed; its
    token count is an estimate (whitespace-separated words × 1.3), not a
    tokenizer count.
    """
    prompt_text = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    started = time.time()
    reply = generate(
        model,
        tokenizer,
        prompt=prompt_text,
        max_tokens=max_tokens,
        verbose=False,
    )
    elapsed = time.time() - started

    # Rough token estimate, for speed reporting only
    approx_tokens = len(reply.split()) * 1.3
    rate = approx_tokens / elapsed if elapsed > 0 else 0
    print(f"  Generated ~{int(approx_tokens)} tokens in {elapsed:.1f}s ({rate:.1f} tok/s)")
    return reply.strip()


if len(chunks) != 1:
    # Multi-pass: summarize each chunk on its own, then ask the model to merge
    # the partial summaries in the requested style.
    chunk_total = len(chunks)
    print(f"Multi-pass summarization ({chunk_total} chunks)...")
    print("Pass 1: Summarizing chunks individually...")

    chunk_summaries = []
    for part_no, section in enumerate(chunks, start=1):
        print(f"  Chunk {part_no}/{chunk_total} ({len(section.split()):,} words)...")
        section_prompt = (
            "Summarize this section of a longer video transcript. "
            f"This is part {part_no} of {chunk_total}.\n\n"
            "Capture all key points, arguments, names, and specific claims.\n\n"
            f"Transcript section:\n\n{section}"
        )
        # Per-chunk summaries get a smaller output budget than the final pass.
        chunk_summaries.append(run_llm(SYSTEM_PROMPT, section_prompt, max_tokens=2048))

    print()
    print("Pass 2: Synthesizing final summary from chunk summaries...")

    combined = "\n\n---\n\n".join(
        f"## Section {section_no}\n{text}"
        for section_no, text in enumerate(chunk_summaries, start=1)
    )
    synthesis_prompt = (
        f"{style_instruction}\n\n"
        "Below are summaries of consecutive sections of a video transcript. "
        "Synthesize them into a single, coherent summary that flows naturally "
        f"and eliminates redundancy.\n\n{combined}"
    )
    summary = run_llm(SYSTEM_PROMPT, synthesis_prompt)

else:
    # Single pass: the whole transcript goes into one prompt.
    print("Generating summary in single pass...")
    summary = run_llm(
        SYSTEM_PROMPT,
        f"{style_instruction}\n\nTranscript:\n\n{chunks[0]}",
    )

# ── Write output ─────────────────────────────────────────────────────────────

# Name shown in the document header: the transcript's "Source:" header value
# if one was found, otherwise the transcript's file name.
source_name = metadata.get('source', os.path.basename(transcript_file))
output_words = len(summary.split())

# Different front matter for blog posts vs summaries
if style == 'blog':
    header = f"# {source_name.replace('_transcript.txt', '').replace('_', ' ')}\n\n"
    header += f"*Originally from: {source_name}*\n\n"
    header += f"---\n\n"
else:
    header = f"# Summary: {source_name}\n\n"
    header += f"| | |\n|---|---|\n"
    header += f"| **Source** | {source_name} |\n"
    header += f"| **Words** | {total_words:,} |\n"
    header += f"| **Model** | {model_label} |\n"
    header += f"| **Style** | {style} |\n"
    header += f"\n---\n\n"

# Write UTF-8 explicitly: model output routinely contains non-ASCII
# characters, and with a locale-dependent default encoding the write could
# fail only after the whole generation has finished.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(header)
    f.write(summary)
    f.write('\n')

print()
print(f"{'Blog post' if style == 'blog' else 'Summary'} saved to: {output_file}")
print(f"Transcript: {total_words:,} words → Output: {output_words:,} words")
PYTHON_SCRIPT

# Final report: a human-readable line naming the result, followed by a
# KEY=value line with the same path.
printf '\n'
printf '%s\n' "Done!"
case "$STYLE" in
    blog) printf 'Blog post: %s\n' "$OUTPUT_FILE" ;;
    *)    printf 'Summary: %s\n' "$OUTPUT_FILE" ;;
esac
# NOTE(review): presumably parsed by a calling script — confirm before changing.
printf 'OUTPUT_FILE=%s\n' "$OUTPUT_FILE"