forked from poe-platform/server-bot-quick-start
-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathbot_H1B.py
292 lines (246 loc) · 7.35 KB
/
bot_H1B.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
"""
BOT_NAME="H1B"; modal deploy --name $BOT_NAME bot_${BOT_NAME}.py; curl -X POST https://api.poe.com/bot/fetch_settings/$BOT_NAME/$POE_ACCESS_KEY
Test message:
How many h1b1 were issued?
"""
import os
import modal
from fastapi_poe import make_app
from modal import Stub, asgi_app, Image, App
import bot_PythonAgent
from bot_PythonAgent import PythonAgentBot
# Workaround: force the private `_is_container_app` flag on `modal.app` to False
# at import time. The rationale is in the Modal beta-testers Slack thread below.
# NOTE(review): this writes to a private attribute, so it may silently stop
# working (or stop being needed) after a Modal upgrade -- confirm on version bumps.
# https://modalbetatesters.slack.com/archives/C031Z7H15DG/p1675177408741889?thread_ts=1675174647.477169&cid=C031Z7H15DG
modal.app._is_container_app = False
# System prompt assigned to H1BBot.python_agent_system_prompt.
# Structure of the text:
#   1. rules for the Python code the model should return (fenced block, must
#      print or plot, case-insensitive substring filters, load '/h1b.csv'),
#   2. the list of columns in h1b.csv,
#   3. the five most frequent values of each column with their counts.
# The whole literal is runtime text sent to the model: edit wording with care.
PYTHON_AGENT_SYSTEM_PROMPT = """
You have access to the H-1B dataset in h1b.csv.
You write the Python code to answer my queries, whenever possible.
When you return Python code
- Encapsulate all Python code within triple backticks (i.e ```python) with newlines.
- The Python code should either print something or plot something
- When filtering rows by <class 'str'> columns, always use .str.contains(<string>, case=False) instead of ==
- The Python code should start with `df = pd.read_csv('/h1b.csv')` (NOTE: this is in the root directory /)
h1b.csv contains information about Labor application information from H-1B, H-1B1, and E-3 Programs.
h1b.csv contains the following columns
- 'CASE_NUMBER'
- 'CASE_STATUS',
- 'RECEIVED_DATE'
- 'DECISION_DATE'
- 'ORIGINAL_CERT_DATE'
- 'VISA_CLASS'
- 'JOB_TITLE'
- 'SOC_TITLE'
- 'FULL_TIME_POSITION'
- 'BEGIN_DATE'
- 'END_DATE'
- 'EMPLOYER_NAME'
- 'AGENT_REPRESENTING_EMPLOYER'
- 'LAWFIRM_NAME_BUSINESS_NAME'
- 'SECONDARY_ENTITY'
- 'SECONDARY_ENTITY_BUSINESS_NAME'
- 'WORKSITE_STATE'
- 'WAGE_RATE_OF_PAY_FROM'
- 'WAGE_RATE_OF_PAY_TO'
- 'WAGE_UNIT_OF_PAY'
- 'YEAR'
- 'QUARTER'
The five most common values for each column is as listed.
I-200-20281-869622 5
I-200-20329-926703 5
I-200-20281-866764 5
I-200-20283-875087 5
I-200-20281-868228 5
Name: CASE_NUMBER, type: <class 'str'>
Certified 2366621
Certified - Withdrawn 118268
Withdrawn 47581
Denied 13921
Name: CASE_STATUS, type: <class 'str'>
2020-10-07 00:00:00 30757
2020-12-09 00:00:00 14006
2020-12-10 00:00:00 9146
2020-12-14 00:00:00 6610
2020-12-11 00:00:00 6436
Name: RECEIVED_DATE, type: <class 'str'>
2020-10-15 00:00:00 29251
2020-12-16 00:00:00 13401
2020-12-17 00:00:00 9003
2020-11-25 00:00:00 8926
2021-02-22 00:00:00 6950
Name: DECISION_DATE, type: <class 'str'>
2020-10-15 00:00:00 1257
2020-12-16 00:00:00 548
2021-06-25 00:00:00 534
2020-05-13 00:00:00 406
2021-04-22 00:00:00 381
Name: ORIGINAL_CERT_DATE, type: <class 'float'>
H-1B 2481378
E-3 Australian 51029
H-1B1 Chile 7802
H-1B1 Singapore 6182
Name: VISA_CLASS, type: <class 'str'>
SOFTWARE ENGINEER 137371
SOFTWARE DEVELOPER 101873
SENIOR SOFTWARE ENGINEER 37190
MANAGER JC50 25504
SENIOR SYSTEMS ANALYST JC60 22488
Name: JOB_TITLE, type: <class 'str'>
Software Developers, Applications 624758
Software Developers 209165
Computer Systems Analysts 160672
Software Developers, Systems Software 116219
Computer Systems Engineers/Architects 86964
Name: SOC_TITLE, type: <class 'str'>
Y 2506698
N 39693
Name: FULL_TIME_POSITION, type: <class 'str'>
2022-10-01 00:00:00 102403
2020-10-01 00:00:00 89445
2021-10-01 00:00:00 85991
2023-10-01 00:00:00 71318
2021-01-01 00:00:00 14035
Name: BEGIN_DATE, type: <class 'str'>
2025-09-30 00:00:00 100715
2024-09-30 00:00:00 87862
2023-09-30 00:00:00 86452
2026-09-30 00:00:00 69101
2024-06-30 00:00:00 13131
Name: END_DATE, type: <class 'str'>
COGNIZANT TECHNOLOGY SOLUTIONS US CORP 74619
AMAZON.COM SERVICES LLC 54156
GOOGLE LLC 48220
TATA CONSULTANCY SERVICES LIMITED 44031
ERNST & YOUNG U.S. LLP 40673
Name: EMPLOYER_NAME, type: <class 'str'>
Yes 1438149
No 530908
Y 405668
N 171666
Name: AGENT_REPRESENTING_EMPLOYER, type: <class 'str'>
FRAGOMEN, DEL REY, BERNSEN & LOEWY, LLP 261280
BERRY APPLEMAN & LEIDEN LLP 141684
FRAGOMEN, DEL REY, BERNSEN & LOEWY LLP 50628
OGLETREE, DEAKINS, NASH, SMOAK & STEWART, P.C. 45981
SEYFARTH SHAW LLP 38399
Name: LAWFIRM_NAME_BUSINESS_NAME, type: <class 'str'>
No 1515532
Yes 453525
N 376986
Y 198387
Name: SECONDARY_ENTITY, type: <class 'str'>
WELLS FARGO 6375
FORD MOTOR COMPANY 5562
VERIZON 4821
CAPITAL ONE 4608
FIDELITY INVESTMENTS 4528
Name: SECONDARY_ENTITY_BUSINESS_NAME, type: <class 'float'>
CA 517753
TX 306873
NY 202901
WA 145738
NJ 134790
Name: WORKSITE_STATE, type: <class 'str'>
120000.0 36230
100000.0 32325
110000.0 30649
130000.0 28461
90000.0 28287
Name: WAGE_RATE_OF_PAY_FROM, type: <class 'numpy.float64'>
120000.0 15849
130000.0 13469
100000.0 13398
150000.0 13069
140000.0 12260
Name: WAGE_RATE_OF_PAY_TO, type: <class 'numpy.float64'>
Year 2394108
Hour 148881
Month 2299
Bi-Weekly 553
Week 548
Name: WAGE_UNIT_OF_PAY, type: <class 'str'>
2021 826305
2022 626084
2020 577334
2023 516668
Name: YEAR, type: <class 'numpy.int64'>
3 1018294
2 753801
1 411680
4 362616
Name: QUARTER, type: <class 'numpy.int64'>
"""
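# Snippet that prints per-column statistics in the format used inside
# PYTHON_AGENT_SYSTEM_PROMPT (top five value counts per column).
# Note: the reported "type" is the Python type of the value in the first row
# (df[column][0]), not the column dtype.
# for column in df.columns:
#     print(str(df[column].value_counts().head(5)).replace(
#         "dtype: int64",
#         "type: " + str(type(df[column][0]))
#     ))
#     print()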
# Script template assigned to H1BBot.code_with_wrappers. The `{code}` placeholder
# is the only brace field in the template; whatever is substituted there runs
# after the preamble below.
#
# What the preamble does:
#   - imports numpy / matplotlib / pandas and shows all DataFrame columns,
#   - silences pandas DtypeWarning,
#   - rebinds `plt.show` and `plt.savefig` to wrappers that call the original
#     function and then always save the current figure to 'image.png'
#     (via the `savefig` name imported before the rebinding, so the wrapper
#     does not recurse into itself).
#
# The nested `def`s must keep their indentation: without it the generated
# script is not valid Python and fails with IndentationError before any of
# the substituted code runs.
CODE_WITH_WRAPPERS = """\
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
import warnings
import pandas as pd
pd.set_option('display.max_columns', None)
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
def save_image(filename):
    def decorator(func):
        def wrapper(*args, **kwargs):
            func(*args, **kwargs)
            savefig(filename)
        return wrapper
    return decorator
plt.show = save_image('image.png')(plt.show)
plt.savefig = save_image('image.png')(plt.savefig)
{code}
"""
# Text assigned to H1BBot.simulated_user_suffix_prompt.
# NOTE(review): presumably appended to the simulated user turn that reports the
# code's output back to the model -- confirm against bot_PythonAgent.
# The value starts and ends with a newline.
SIMULATED_USER_SUFFIX_PROMPT = (
    "\n"
    "If there is an issue, you will fix the Python code.\n"
    "Otherwise, provide a brief and concise summary, WITHOUT repeating the output.\n"
    "Write in normal markdown.\n"
)
# Packages pip-installed into the image below, kept in their original order.
_IMAGE_EXEC_PIP_PACKAGES = (
    "ipython",
    "scipy",
    "matplotlib",
    "scikit-learn",
    "pandas",
    "ortools",
    "openai",
    "requests",
    "beautifulsoup4",
    "newspaper3k",
    "XlsxWriter",
    "docx2txt",
    "markdownify",
    "pdfminer.six",
    "Pillow",
    "sortedcontainers",
    "intervaltree",
    "geopandas",
    "basemap",
    "tiktoken",
    "basemap-data-hires",
    "yfinance",
    "dill",
    "seaborn",
    "openpyxl",
    "cartopy",
    "wordcloud",
)

# Modal image assigned to H1BBot.image_exec: a Debian-slim base with the
# packages above installed and the local h1b.csv copied in as "h1b.csv".
# NOTE(review): the system prompt tells the model to read '/h1b.csv', so this
# relies on the relative destination resolving to the image root -- confirm.
_exec_image_base = Image.debian_slim()
_exec_image_with_packages = _exec_image_base.pip_install(*_IMAGE_EXEC_PIP_PACKAGES)
IMAGE_EXEC = _exec_image_with_packages.copy_local_file("h1b.csv", "h1b.csv")
class H1BBot(PythonAgentBot):
    """PythonAgentBot configured for questions about the H-1B dataset (h1b.csv).

    The class only overrides configuration attributes; all behaviour comes from
    PythonAgentBot, which is defined in bot_PythonAgent and not visible here.
    """

    # --- Prompt text and code template defined earlier in this module ---
    python_agent_system_prompt = PYTHON_AGENT_SYSTEM_PROMPT
    simulated_user_suffix_prompt = SIMULATED_USER_SUFFIX_PROMPT
    code_with_wrappers = CODE_WITH_WRAPPERS

    # --- Execution environment (Modal image with the data file copied in) ---
    image_exec = IMAGE_EXEC

    # --- Model / agent settings ---
    # NOTE(review): meanings below are inferred from the attribute names;
    # confirm against PythonAgentBot.
    prompt_bot = "Claude-3.5-Sonnet"  # presumably the Poe bot that receives the prompts
    code_iteration_limit = 3  # presumably the max number of run-and-fix rounds
    logit_bias = {}  # empty mapping: no logit bias supplied
    allow_attachments = False