-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpt35.py
98 lines (82 loc) · 4.11 KB
/
gpt35.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from g4f.client import Client # Adjust this import to your actual GPT client's import path
import sys
# Module-level g4f client; process_row sends every chat-completion request
# through this single instance.
# NOTE(review): main() runs process_row on several worker threads at once -
# confirm that one Client instance is safe to share between threads.
client = Client()
def find_files(directory="."):
    """Find an Excel workbook and its matching prompt file in *directory*.

    A workbook ``name.xlsx`` matches when ``name.txt`` exists next to it.

    Args:
        directory: Directory to scan (not recursive). Defaults to the
            current working directory.

    Returns:
        A ``(excel_path, txt_path)`` tuple for the first matching pair in
        sorted filename order, or ``(None, None)`` when no pair exists.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # deterministic so that every rerun resumes work on the same workbook.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".xlsx"):
            continue
        excel_file = os.path.join(directory, file)
        txt_file = os.path.splitext(excel_file)[0] + ".txt"
        if os.path.exists(txt_file):
            return excel_file, txt_file
    return None, None
def load_prompt(txt_file):
    """Load the prompt text from *txt_file*.

    Args:
        txt_file: Path to a UTF-8 encoded text file.

    Returns:
        The file content with leading and trailing whitespace removed.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
    # byte-order mark. str.strip() does not treat U+FEFF as whitespace, so
    # with plain "utf-8" the mark would be sent as part of every prompt.
    with open(txt_file, 'r', encoding='utf-8-sig') as file:
        return file.read().strip()
def process_row(prompt, row_content, row_number, excel_basename, response_dir):
    """Send one row to the GPT API and save the reply to a text file.

    The request text is ``prompt`` and ``row_content`` joined by a single
    space. On success the reply is written to
    ``<response_dir>/<excel_basename>_<row_number>.txt``.

    Args:
        prompt: Instruction text placed in front of the row content.
        row_content: Cell value of the row being processed.
        row_number: 1-based row number, used in the output filename.
        excel_basename: Workbook name without extension, used as the
            output filename prefix.
        response_dir: Existing directory that receives the response file.

    Returns:
        True when a non-empty reply was saved, False otherwise. Errors are
        printed rather than raised so one bad row does not stop the batch.
    """
    full_prompt = f"{prompt} {row_content}"
    response_filename = os.path.join(response_dir, f"{excel_basename}_{row_number}.txt")
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",  # Specify your model here
            messages=[{"role": "user", "content": full_prompt}],
        )
        response = completion.choices[0].message.content
        # Validate before touching the filesystem: opening the file first
        # would leave an empty response file behind for a missing reply, and
        # the presence of that file marks the row as already processed.
        if not response:
            raise ValueError("empty response from model")
        with open(response_filename, 'w', encoding='utf-8') as file:
            file.write(response)
        return True
    except Exception as e:
        # Deliberately broad: this is the worker-thread boundary, and the
        # caller only needs to know whether the row succeeded.
        print(f"Error processing row {row_number}: {e}")
        return False
def _completed_rows(responses_dir, excel_basename):
    """Return the set of row numbers that already have a response file."""
    done = set()
    for name in os.listdir(responses_dir):
        stem, ext = os.path.splitext(name)
        prefix, sep, number = stem.rpartition('_')
        # Only files shaped like "<excel_basename>_<N>.txt" count; anything
        # else in the directory (e.g. .DS_Store) is ignored instead of
        # crashing int().
        if ext == ".txt" and sep and prefix == excel_basename and number.isdecimal():
            done.add(int(number))
    return done


def main():
    """Process up to 50 not-yet-answered rows of the workbook per run.

    Steps:
        1. Locate the workbook/prompt pair in the current directory.
        2. Work out which rows of the ``Text`` column have no response file
           in ``<workbook>_responses`` yet.
        3. Send at most 50 of those rows to the GPT API using 5 worker
           threads, printing a progress line as each request finishes.

    Rows are numbered from 1 in sheet order. A row counts as done only when
    its own response file exists, so rows that failed in an earlier run are
    retried even if later rows succeeded.
    """
    start_time = datetime.now()
    excel_file, txt_file = find_files()
    if not excel_file or not txt_file:
        print("Excel or text file not found.")
        return
    prompt = load_prompt(txt_file)
    dataframe = pd.read_excel(excel_file)
    responses_dir = os.path.splitext(excel_file)[0] + "_responses"
    os.makedirs(responses_dir, exist_ok=True)
    excel_basename = os.path.splitext(os.path.basename(excel_file))[0]

    if len(dataframe) and 'Text' not in dataframe.columns:
        print("Column 'Text' not found in the Excel file.")
        return
    texts = dataframe['Text'] if len(dataframe) else []

    # Identify the rows that still need a response. Using the set of finished
    # rows (rather than the highest finished row number) keeps gaps left by
    # failed requests eligible for another attempt.
    done = _completed_rows(responses_dir, excel_basename)
    pending = [
        (row_number, text)
        for row_number, text in enumerate(texts, start=1)
        if row_number not in done
    ]
    rows_to_process = len(pending)
    if rows_to_process == 0:
        print("All rows have already been processed.")
        return

    max_requests = 50  # Limit to 50 requests per run
    batch = pending[:max_requests]
    failed_count = 0
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(process_row, prompt, text, row_number, excel_basename, responses_dir)
            for row_number, text in batch
        ]
        # Real-time progress updates, printed as each request completes.
        # The estimate extrapolates the observed pace to all pending rows,
        # including those left for later runs.
        for i, future in enumerate(as_completed(futures)):
            if not future.result():
                failed_count += 1
            elapsed_time = (datetime.now() - start_time).total_seconds()
            rows_processed = i + 1
            estimated_time_per_row = elapsed_time / rows_processed
            estimated_total_time = estimated_time_per_row * rows_to_process
            estimated_completion_time = start_time + timedelta(seconds=estimated_total_time)
            time_left = estimated_total_time - elapsed_time
            rows_left = rows_to_process - rows_processed
            print(f"\rProgress: {rows_processed}/{rows_to_process} rows processed. Estimated completion: {estimated_completion_time.strftime('%Y-%m-%d %H:%M:%S')}. Time left: {time_left/60:.2f} minutes. Rows left: {rows_left}.", end='')
            sys.stdout.flush()
    print("\nProcessing completed or maximum requests reached. Please rerun to continue.")
    if failed_count:
        print(f"{failed_count} row(s) failed and will be retried on the next run.")
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()