
Commit dd36141

[apps/webscrape] : Manually ensure csv is formatted correctly (#482)
1 parent 7713426 commit dd36141

3 files changed: +31 -19 lines changed

apps/webscrape/app.py

Lines changed: 22 additions & 11 deletions
@@ -7,10 +7,10 @@
 import sys
 import os
 import csv
+import re

 # browser-use imports and setup
-from browser_use import Agent, Browser, BrowserConfig, SystemPrompt, ActionResult
-from browser_use.browser.context import BrowserContextConfig, BrowserContext
+from browser_use import Agent, Browser, BrowserConfig
 from browser_use.agent.service import Agent
 from langchain_openai import ChatOpenAI

@@ -23,19 +23,30 @@
 # utility functions and variables
 __location__ = os.path.realpath(
     os.path.join(os.getcwd(), os.path.dirname(__file__)))
+EOL = "eol"

 file_path = '.user/staff.csv'
 os.makedirs(os.path.dirname(file_path), exist_ok = True)

 def append_csv(data):
+    splitter = re.compile(f"\s*{re.escape(EOL)}\s*\n?", re.IGNORECASE)
+    rows = [row.strip() for row in re.split(splitter, data) if row.strip()]
+
+    processed_rows = []
+    for row in rows:
+        fields = [field.strip() for field in row.split(',')]
+        processed_rows.append(fields)
+
     if os.path.exists(file_path):
         with open(file_path, 'a', newline='') as file:
-            writer = csv.writer(file)
-            writer.writerow(data)
+            writer = csv.writer(file)
+            for row in processed_rows:
+                writer.writerow(row)
     else:
         with open(file_path, 'w', newline='') as file:
-            writer = csv.writer(file)
-            writer.writerow(data)
+            writer = csv.writer(file)
+            for row in processed_rows:
+                writer.writerow(row)

 # browser-use config
 with open(os.path.join(__location__, 'browseruse_prompt.md'), 'r') as f:

@@ -81,7 +92,10 @@ async def main():
     result = (await run(agent, browser)).final_result()

     # ask openai to generate a csv file from this
-    csv_prompt = openai_prompt + "This is the company it's about: " + user_input + "And this is the JSON: " + result
+    csv_prompt = openai_prompt +\
+        "This is the company it's about: " + user_input +\
+        ". And this is the JSON: " + result +\
+        ". And this is the value to be written into the EOL column: " + EOL

     messages = h9.load("messages", [])
     messages.append({"role": "user", "content": csv_prompt})

@@ -91,13 +105,10 @@ async def main():
     for chunk in completion:
         if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
             content = chunk.choices[0].delta.content
-            print(content, end="")
+            print(content, end = "")
             response += content
-
     # append to existing csv file
     append_csv(response)
-    print(os.path.join("Staff information saved at: " + file_path))
-
 asyncio.run(main())

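With this change, append_csv no longer receives one pre-split row per call: it takes the full streamed response, cuts it into rows wherever the EOL sentinel appears, and splits each row into stripped, comma-separated fields before handing them to csv.writer. Below is a minimal standalone sketch of that splitting step using the same regex as the committed code; the parse_rows helper and the streamed input string are illustrative only, not part of this commit.

import re

# Sentinel the model is asked to write into an extra "eol" column at the end
# of every CSV row (mirrors the EOL constant and splitter added in app.py).
EOL = "eol"
splitter = re.compile(rf"\s*{re.escape(EOL)}\s*\n?", re.IGNORECASE)

def parse_rows(data: str) -> list[list[str]]:
    # Split the response at each EOL marker, drop empty pieces, then split
    # every row into comma-separated fields and strip surrounding whitespace.
    rows = [row.strip() for row in splitter.split(data) if row.strip()]
    return [[field.strip() for field in row.split(",")] for row in rows]

# Hypothetical streamed response: the line break between the two rows is
# missing, but each logical row still ends with the "eol" marker.
streamed = (
    "SomeComp, Data Science, Team Lead, Julieta Marquez eol"
    "SomeComp, Legal, Advisor, João Souza eol\n"
)

for row in parse_rows(streamed):
    print(row)
# ['SomeComp', 'Data Science', 'Team Lead', 'Julieta Marquez']
# ['SomeComp', 'Legal', 'Advisor', 'João Souza']

Keying the split on the sentinel rather than on newlines is what lets append_csv cope with responses whose line breaks get dropped or doubled during streaming, and the IGNORECASE flag means EOL, Eol, and eol all terminate a row.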

apps/webscrape/browseruse_prompt.md

Lines changed: 3 additions & 3 deletions
@@ -5,7 +5,8 @@ I want to know who works at a certain company. Concretely, I want to know:
 - the url of their github repository (if available)
 - the url of their linkedin profile (if available)

-I want you to gather that information directly from the company's website, for example, from a webpage called "teams".
+I want you to gather that information directly from the company's website, for example, from a webpage called "teams". To find the url for the company's website, use duckduckgo.com as your search engine.
+I.e., do NOT EVER go to LinkedIn or GitHub themselves!

 I want you to report back that information as a list of JSON objects, where every object has the following keys:
 - full_name
@@ -17,7 +18,6 @@ I want you to report back that information as a list of JSON objects, where ever
 Insert an empty string as a value if a piece of information is not available.

 Important: do NOT infer job_title from team, nor team from job_title. For both, only report a value if it is given on the website; otherwise, leave empty.
-
-For links, only report the url, not the display text. I.e., no markdown!
+And: Do NOT follow the GitHub and LinkedIn links you are extracting! ONLY report them (if available from the company website)!

 This is the company I want to know the people of:

apps/webscrape/openai_prompt.md

Lines changed: 6 additions & 5 deletions
@@ -9,13 +9,14 @@ Start immediately with the header (no additional comments or preambles). The hea
 - full_name
 - github_link
 - linkedin_link
+- eol

 Leave empty any information you are not given.

 Here is an example:

-company_name, team, job_title, full_name, github_link, linkedin_link
-SomeComp, Data Science, Junior Data Scientist, Julio Álvarez, https://www.github.com/juli,
-SomeComp, Data Science, Team Lead, Julieta Marquez, https://www.github.com/julieta7777, https://www.linkedin.com/in/julieta-marquez-2330ob184/
-SomeComp, Legal, Advisor, João Souza, ,
-SomeComp, Accounting, , Davide Romano, , https://www.linkedin.com/in/davide-romano-2331ob184/
+company_name, team, job_title, full_name, github_link, linkedin_link, eol
+SomeComp, Data Science, Junior Data Scientist, Julio Álvarez, https://www.github.com/juli, eol
+SomeComp, Data Science, Team Lead, Julieta Marquez, https://www.github.com/julieta7777, https://www.linkedin.com/in/julieta-marquez-2330ob184/, eol
+SomeComp, Legal, Advisor, João Souza, , , eol
+SomeComp, Accounting, , Davide Romano, , https://www.linkedin.com/in/davide-romano-2331ob184/, eol

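Since every generated row now carries the extra eol sentinel column, downstream code that reads .user/staff.csv back will likely want to drop that column. A minimal sketch of such a reader, assuming the file was produced by append_csv above; the read_staff helper is hypothetical and not part of this commit.

import csv

def read_staff(path: str = ".user/staff.csv") -> list[dict]:
    # Load the scraped staff table and discard the trailing sentinel column,
    # which only exists to delimit rows while the response is streamed.
    with open(path, newline="") as file:
        reader = csv.DictReader(file)
        return [
            {key: value for key, value in row.items()
             if key and key.strip().lower() != "eol"}
            for row in reader
        ]

for person in read_staff():
    print(person.get("full_name"), person.get("linkedin_link"))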
