
Commit dd36141

[apps/webscrape] : Manually ensure csv is formatted correctly (#482)
1 parent 7713426 commit dd36141

3 files changed: +31 -19 lines changed

apps/webscrape/app.py

Lines changed: 22 additions & 11 deletions
@@ -7,10 +7,10 @@
 import sys
 import os
 import csv
+import re

 # browser-use imports and setup
-from browser_use import Agent, Browser, BrowserConfig, SystemPrompt, ActionResult
-from browser_use.browser.context import BrowserContextConfig, BrowserContext
+from browser_use import Agent, Browser, BrowserConfig
 from browser_use.agent.service import Agent
 from langchain_openai import ChatOpenAI

@@ -23,19 +23,30 @@
 # utility functions and variables
 __location__ = os.path.realpath(
     os.path.join(os.getcwd(), os.path.dirname(__file__)))
+EOL = "eol"

 file_path = '.user/staff.csv'
 os.makedirs(os.path.dirname(file_path), exist_ok = True)

 def append_csv(data):
+    splitter = re.compile(f"\s*{re.escape(EOL)}\s*\n?", re.IGNORECASE)
+    rows = [row.strip() for row in re.split(splitter, data) if row.strip()]
+
+    processed_rows = []
+    for row in rows:
+        fields = [field.strip() for field in row.split(',')]
+        processed_rows.append(fields)
+
     if os.path.exists(file_path):
         with open(file_path, 'a', newline='') as file:
-            writer = csv.writer(file)
-            writer.writerow(data)
+            writer = csv.writer(file)
+            for row in processed_rows:
+                writer.writerow(row)
     else:
         with open(file_path, 'w', newline='') as file:
-            writer = csv.writer(file)
-            writer.writerow(data)
+            writer = csv.writer(file)
+            for row in processed_rows:
+                writer.writerow(row)

 # browser-use config
 with open(os.path.join(__location__, 'browseruse_prompt.md'), 'r') as f:

@@ -81,7 +92,10 @@ async def main():
     result = (await run(agent, browser)).final_result()

     # ask openai to generate a csv file from this
-    csv_prompt = openai_prompt + "This is the company it's about: " + user_input + "And this is the JSON: " + result
+    csv_prompt = openai_prompt +\
+        "This is the company it's about: " + user_input +\
+        ". And this is the JSON: " + result +\
+        ". And this is the value to be written into the EOL column: " + EOL

     messages = h9.load("messages", [])
     messages.append({"role": "user", "content": csv_prompt})

@@ -91,13 +105,10 @@ async def main():
     for chunk in completion:
         if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
             content = chunk.choices[0].delta.content
-            print(content, end="")
+            print(content, end = "")
             response += content
-
     # append to existing csv file
     append_csv(response)
-    print(os.path.join("Staff information saved at: " + file_path))
-
 asyncio.run(main())

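With this change, append_csv no longer receives one pre-split row per call: it takes the full streamed response, cuts it into rows wherever the EOL sentinel appears, and splits each row into stripped, comma-separated fields before handing them to csv.writer. Below is a minimal standalone sketch of that splitting step using the same regex as the committed code; the parse_rows helper and the streamed input string are illustrative only, not part of this commit.

import re

# Sentinel the model is asked to write into an extra "eol" column at the end
# of every CSV row (mirrors the EOL constant and splitter added in app.py).
EOL = "eol"
splitter = re.compile(rf"\s*{re.escape(EOL)}\s*\n?", re.IGNORECASE)

def parse_rows(data: str) -> list[list[str]]:
    # Split the response at each EOL marker, drop empty pieces, then split
    # every row into comma-separated fields and strip surrounding whitespace.
    rows = [row.strip() for row in splitter.split(data) if row.strip()]
    return [[field.strip() for field in row.split(",")] for row in rows]

# Hypothetical streamed response: the line break between the two rows is
# missing, but each logical row still ends with the "eol" marker.
streamed = (
    "SomeComp, Data Science, Team Lead, Julieta Marquez eol"
    "SomeComp, Legal, Advisor, João Souza eol\n"
)

for row in parse_rows(streamed):
    print(row)
# ['SomeComp', 'Data Science', 'Team Lead', 'Julieta Marquez']
# ['SomeComp', 'Legal', 'Advisor', 'João Souza']

Keying the split on the sentinel rather than on newlines is what lets append_csv cope with responses whose line breaks get dropped or doubled during streaming, and the IGNORECASE flag means EOL, Eol, and eol all terminate a row.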

apps/webscrape/browseruse_prompt.md

Lines changed: 3 additions & 3 deletions
@@ -5,7 +5,8 @@ I want to know who works at a certain company. Concretely, I want to know:
 - the url of their github repository (if available)
 - the url of their linkedin profile (if available)

-I want you to gather that information directly from the company's website, for example, from a webpage called "teams".
+I want you to gather that information directly from the company's website, for example, from a webpage called "teams". To find the url for the company's website, use duckduckgo.com as your search engine.
+I.e., do NOT EVER go to LinkedIn or GitHub themselves!

 I want you to report back that information as a list of JSON objects, where every object has the following keys:
 - full_name
@@ -17,7 +18,6 @@ I want you to report back that information as a list of JSON objects, where ever
 Insert an empty string as a value if a piece of information is not available.

 Important: do NOT infer job_title from team, nor team from job_title. For both, only report a value if it is given on the website; otherwise, leave empty.
-
-For links, only report the url, not the display text. I.e., no markdown!
+And: Do NOT follow the GitHub and LinkedIn links you are extracting! ONLY report them (if available from the company website)!

 This is the company I want to know the people of:

apps/webscrape/openai_prompt.md

Lines changed: 6 additions & 5 deletions
@@ -9,13 +9,14 @@ Start immediately with the header (no additional comments or preambles). The hea
 - full_name
 - github_link
 - linkedin_link
+- eol

 Leave empty any information you are not given.

 Here is an example:

-company_name, team, job_title, full_name, github_link, linkedin_link
-SomeComp, Data Science, Junior Data Scientist, Julio Álvarez, https://www.github.com/juli,
-SomeComp, Data Science, Team Lead, Julieta Marquez, https://www.github.com/julieta7777, https://www.linkedin.com/in/julieta-marquez-2330ob184/
-SomeComp, Legal, Advisor, João Souza, ,
-SomeComp, Accounting, , Davide Romano, , https://www.linkedin.com/in/davide-romano-2331ob184/
+company_name, team, job_title, full_name, github_link, linkedin_link, eol
+SomeComp, Data Science, Junior Data Scientist, Julio Álvarez, https://www.github.com/juli, eol
+SomeComp, Data Science, Team Lead, Julieta Marquez, https://www.github.com/julieta7777, https://www.linkedin.com/in/julieta-marquez-2330ob184/, eol
+SomeComp, Legal, Advisor, João Souza, , , eol
+SomeComp, Accounting, , Davide Romano, , https://www.linkedin.com/in/davide-romano-2331ob184/, eol

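Since every generated row now carries the extra eol sentinel column, downstream code that reads .user/staff.csv back will likely want to drop that column. A minimal sketch of such a reader, assuming the file was produced by append_csv above; the read_staff helper is hypothetical and not part of this commit.

import csv

def read_staff(path: str = ".user/staff.csv") -> list[dict]:
    # Load the scraped staff table and discard the trailing sentinel column,
    # which only exists to delimit rows while the response is streamed.
    with open(path, newline="") as file:
        reader = csv.DictReader(file)
        return [
            {key: value for key, value in row.items()
             if key and key.strip().lower() != "eol"}
            for row in reader
        ]

for person in read_staff():
    print(person.get("full_name"), person.get("linkedin_link"))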
