Skip to content

Commit 2ab36e7

Browse files
author
Petr Lavrov
committed
Foundational refactor for WB 2.0
Move all old code to archive. Add new lib usage template.
1 parent 1ae5fd3 commit 2ab36e7

24 files changed

+4606
-59
lines changed

.github/workflows/main.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,14 @@ jobs:
1515
# 3.7 doesn't support f"{a=}" syntax
1616
# python-version: [ "3.8", "3.9", "3.10", "3.11" ]
1717
# don't really need anything older than 3.10
18-
python-version: [ "3.10", "3.11" ]
18+
python-version: [ "3.12" ]
1919
steps:
2020
- uses: actions/checkout@v3
2121
- name: Set up Python ${{ matrix.python-version }}
2222
uses: actions/setup-python@v4
2323
with:
2424
python-version: ${{ matrix.python-version }}
25+
# TODO: install ffmpeg in CI for the tests (presumably needed for audio handling; confirm)
2526
- name: Install Poetry
2627
run: |
2728
curl -sSL https://install.python-poetry.org | python3 -

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -157,4 +157,4 @@ cython_debug/
157157
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160-
#.idea/
160+
.idea/

dev/michael_example.py archive/dev/michael_example.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
import pickle
2-
31
import openai
2+
import pickle
43
from loguru import logger
54
from pydub import AudioSegment
65
from pydub.silence import detect_silence

archive/dev/todo.md

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Section 1 - improving existing bot
2+
3+
## feature 1.1 - improve logging
4+
5+
- [ ] Delete the 'generating' message from telegram once finished
6+
- [ ] Track time for generation - add to log
7+
- [ ] Report progress and remaining time to telegram
8+
- [ ] Can I have text-based tqdm?
9+
- [ ] I can probably send updates every 5 seconds
10+
- [ ] For sequential mode report the text generated so far
11+
12+
## feature 1.2 - add parallel processing
13+
14+
- [ ] Report processing mode in the status message
15+
- [ ] Calculate expected processing time
16+
- [ ] use parallel mode in the bot
17+
18+
## feature 1.3 - add config settings commands
19+
20+
- [ ] s1 add a command to configure parallel mode
21+
- [ ] s1 add parallel mode to config class
22+
- [ ] s4 add automatic command generation for all config fields
23+
24+
# Section 2 - developing new features
25+
26+
## feature 2.1 - gpt summary
27+
28+
- [ ] Generate punctuation with gpt
29+
30+
# Section 3 - common features (for bot base)
31+
32+
## feature 3.1 - gpt utils
33+
34+
- [ ] extract gpt utils to separate file
35+
- [ ] extract gpt utils to a separate lib
36+
37+
# Section 4 - unsorted
38+
39+
## feature 0.2 - gpt fill/extract the info
40+
41+
## feature 0.3 - gpt guess speakers?
42+
43+
## feature 3 - add retries
44+
45+
- On file download
46+
- On openai api
47+
- Option 1: decorator
48+
- Option 2: write a custom method
49+
50+
## feature 5 - (TBD)
51+
52+
## feature 6 - add video support
53+
54+
- [ ] detect and download video from telegram (add handler)
55+
- [ ] extract audio from video
File renamed without changes.

archive/useless/run.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# run the app
2+
from bot_base.data_model.mongo_utils import connect_to_db
3+
from bot_base.utils.logging_utils import setup_logger
4+
from dotenv import load_dotenv
5+
from whisper_bot.core.app import WhisperApp
6+
7+
if __name__ == "__main__":
8+
load_dotenv()
9+
# connect to db
10+
connect_to_db()
11+
12+
# setup logger
13+
setup_logger()
14+
15+
app = WhisperApp()
16+
app.run()
File renamed without changes.
File renamed without changes.

archive/whisper_bot/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1+
from .app import WhisperApp
12
from .app_config import WhisperAppConfig
23
from .telegram_bot import WhisperTelegramBot
3-
from .app import WhisperApp

whisper_bot/core/app.py archive/whisper_bot/core/app.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,14 @@
33
- Database for storing processed whispers and logs
44
-
55
"""
6-
from functools import partial
7-
86
from bot_base.core import App
97
from bot_base.utils.gpt_utils import (
108
split_by_weight,
119
get_token_count,
1210
amap_gpt_command,
1311
token_limit_by_model,
1412
)
13+
from functools import partial
1514
from whisper_bot.core import WhisperAppConfig
1615
from whisper_bot.core import WhisperTelegramBot
1716
from whisper_bot.core.app_config import WhisperTelegramBotConfig
@@ -37,7 +36,9 @@ async def merge_and_format_chunks(self, chunks):
3736
)
3837
# step 2: merge chunks in each group
3938
self.logger.info(f"Merging chunks in each group")
40-
merged_groups = [merge_all_chunks(group, logger=self.logger) for group in groups]
39+
merged_groups = [
40+
merge_all_chunks(group, logger=self.logger) for group in groups
41+
]
4142
# step 3: format each group
4243
self.logger.info(f"Formatting each group")
4344
formatted_groups = await amap_gpt_command(
File renamed without changes.

whisper_bot/core/telegram_bot.py archive/whisper_bot/core/telegram_bot.py

+43-25
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
from datetime import datetime
2-
from textwrap import dedent
3-
from typing import TYPE_CHECKING
4-
51
from aiogram import F
62
from aiogram import types
7-
83
from bot_base.core import mark_command
94
from bot_base.core.telegram_bot import TelegramBot
10-
from bot_base.utils.text_utils import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, \
11-
split_text_with_overlap
5+
from bot_base.utils.text_utils import (
6+
DEFAULT_CHUNK_SIZE,
7+
DEFAULT_CHUNK_OVERLAP,
8+
split_text_with_overlap,
9+
)
10+
from datetime import datetime
11+
from textwrap import dedent
12+
from typing import TYPE_CHECKING
1213
from whisper_bot.core.app_config import WhisperTelegramBotConfig
1314
from whisper_bot.utils.text_utils import (
1415
merge_all_chunks,
@@ -43,11 +44,14 @@ async def process_audio(self, message: types.Message):
4344
raw_transcript = "\n\n".join(chunks)
4445
self.logger.info(f"Raw transcript", data=raw_transcript)
4546
if self.config.send_raw_transcript:
46-
filename = f"raw_transcript_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"
47+
filename = (
48+
f"raw_transcript_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"
49+
)
4750
await self.send_safe(
48-
chat_id=message.chat.id, text=raw_transcript,
51+
chat_id=message.chat.id,
52+
text=raw_transcript,
4953
reply_to_message_id=message.message_id,
50-
filename=filename
54+
filename=filename,
5155
)
5256

5357
# subsplit large chunks
@@ -58,8 +62,10 @@ async def process_audio(self, message: types.Message):
5862
self.logger.info("Transcript", data=transcript)
5963
filename = f"transcript_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"
6064
await self.send_safe(
61-
chat_id=message.chat.id, text=transcript, reply_to_message_id=message.message_id,
62-
filename=filename
65+
chat_id=message.chat.id,
66+
text=transcript,
67+
reply_to_message_id=message.message_id,
68+
filename=filename,
6369
)
6470

6571
await placeholder.delete()
@@ -90,8 +96,9 @@ async def chat_message_handler(self, message: types.Message):
9096
return message_text
9197

9298
@staticmethod
93-
def subsplit_large_chunks(chunks, chunk_limit=DEFAULT_CHUNK_SIZE,
94-
overlap=DEFAULT_CHUNK_OVERLAP):
99+
def subsplit_large_chunks(
100+
chunks, chunk_limit=DEFAULT_CHUNK_SIZE, overlap=DEFAULT_CHUNK_OVERLAP
101+
):
95102
res_chunks = []
96103
for chunk in chunks:
97104
res_chunks += split_text_with_overlap(chunk, chunk_limit, overlap)
@@ -107,7 +114,8 @@ async def merge_chunks_command(self, message: types.Message):
107114
text = await self._extract_text_from_message(message)
108115

109116
# remove command from text
110-
if text.startswith("/"): _, text = text.split(maxsplit=1)
117+
if text.startswith("/"):
118+
_, text = text.split(maxsplit=1)
111119

112120
# step 2: split chunks
113121
chunks = text.split("\n\n")
@@ -122,8 +130,10 @@ async def merge_chunks_command(self, message: types.Message):
122130
# send back the result
123131
filename = f"merged_text_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"
124132
await self.send_safe(
125-
chat_id=message.chat.id, text=result, reply_to_message_id=message.message_id,
126-
filename=filename
133+
chat_id=message.chat.id,
134+
text=result,
135+
reply_to_message_id=message.message_id,
136+
filename=filename,
127137
)
128138

129139
# ------------------------------------------------------------
@@ -141,7 +151,8 @@ async def format_text_command(self, message: types.Message):
141151
text = await self._extract_text_from_message(message)
142152

143153
# remove command from text
144-
if text.startswith("/"): _, text = text.split(maxsplit=1)
154+
if text.startswith("/"):
155+
_, text = text.split(maxsplit=1)
145156

146157
# format the text
147158
result = await format_text_with_gpt(
@@ -155,8 +166,10 @@ async def format_text_command(self, message: types.Message):
155166
# send back the result
156167
filename = f"formatted_text_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"
157168
await self.send_safe(
158-
chat_id=message.chat.id, text=result, reply_to_message_id=message.message_id,
159-
filename=filename
169+
chat_id=message.chat.id,
170+
text=result,
171+
reply_to_message_id=message.message_id,
172+
filename=filename,
160173
)
161174

162175
@mark_command("fix_grammar")
@@ -167,7 +180,8 @@ async def fix_grammar_command(self, message: types.Message):
167180
text = await self._extract_text_from_message(message)
168181

169182
# remove command from text
170-
if text.startswith("/"): _, text = text.split(maxsplit=1)
183+
if text.startswith("/"):
184+
_, text = text.split(maxsplit=1)
171185

172186
# format the text
173187
result = await format_text_with_gpt(
@@ -181,8 +195,10 @@ async def fix_grammar_command(self, message: types.Message):
181195
# send back the result
182196
filename = f"formatted_text_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"
183197
await self.send_safe(
184-
chat_id=message.chat.id, text=result, reply_to_message_id=message.message_id,
185-
filename=filename
198+
chat_id=message.chat.id,
199+
text=result,
200+
reply_to_message_id=message.message_id,
201+
filename=filename,
186202
)
187203

188204
# ------------------------------------------------------------
@@ -208,8 +224,10 @@ async def merge_and_format_command(self, message: types.Message):
208224
# send back the result
209225
filename = f"formatted_text_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"
210226
await self.send_safe(
211-
chat_id=message.chat.id, text=result, reply_to_message_id=message.message_id,
212-
filename=filename
227+
chat_id=message.chat.id,
228+
text=result,
229+
reply_to_message_id=message.message_id,
230+
filename=filename,
213231
)
214232

215233
# ------------------------------------------------------------

whisper_bot/utils/text_utils.py archive/whisper_bot/utils/text_utils.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import loguru
2-
from difflib import SequenceMatcher
3-
from typing import Iterable
4-
52
from bot_base.utils.gpt_utils import (
63
arun_command_with_gpt,
74
token_limit_by_model,
85
get_token_count,
96
)
7+
from difflib import SequenceMatcher
8+
from typing import Iterable
109

1110

1211
def normalize_text(text):

0 commit comments

Comments
 (0)