-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinput_handler.py
More file actions
195 lines (149 loc) · 6.62 KB
/
input_handler.py
File metadata and controls
195 lines (149 loc) · 6.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""
Search and input handling for website discovery
Supports both search-term based and CSV list-based input
"""
import csv
import asyncio
from typing import List, Dict, Any
from pathlib import Path
from urllib.parse import urlparse
from loguru import logger
from ddgs import DDGS
import pandas as pd
class SearchHandler:
    """Handles search-term based website discovery.

    Runs a DuckDuckGo text search and keeps only results that look like a
    company's own website (http/https URLs whose host is not a known social
    network, job board, or listing/directory site).
    """

    # Host labels of sites that are never a company's own homepage.
    # Matched against whole dot-separated labels of the hostname, so
    # "fi.linkedin.com" is skipped while "mediumrare.io" is not.
    _SKIP_DOMAINS = frozenset({
        'linkedin', 'facebook', 'twitter', 'instagram',
        'indeed', 'glassdoor', 'youtube', 'wikipedia',
        'reddit', 'quora', 'medium', 'pinterest',
        'clutch', 'goodfirms', 'themanifest', 'techbehemoths',
        'pitchbook', 'crunchbase', 'softwarecompanynetwork',
    })

    def __init__(self, num_results: int = 5):
        """
        Args:
            num_results: Maximum number of URLs returned by a search
        """
        self.num_results = num_results

    @classmethod
    def _is_company_url(cls, url: str) -> bool:
        """Return True if *url* is an http(s) URL whose host is not on the skip list."""
        if not url:
            return False
        try:
            parsed = urlparse(url)
            host = parsed.hostname  # lower-cased, without port or credentials
        except ValueError:
            # urlparse rejects some malformed hosts (e.g. unbalanced IPv6 brackets)
            return False
        if parsed.scheme not in ('http', 'https') or not host:
            return False
        return cls._SKIP_DOMAINS.isdisjoint(host.split('.'))

    def search_companies(self, search_term: str) -> List[str]:
        """
        Search for companies using DuckDuckGo and return URLs

        Args:
            search_term: Search query (e.g., "Software development consultancy finland")

        Returns:
            List of website URLs (at most ``num_results``). An empty list is
            returned when ``num_results`` is not positive or when the search
            fails; failures are logged rather than raised.
        """
        # Nothing was asked for: skip the network round trip entirely.
        if self.num_results <= 0:
            return []

        logger.info(f"Searching for: '{search_term}' (top {self.num_results} results)")
        try:
            urls = []
            with DDGS() as ddgs:
                results = ddgs.text(
                    search_term,
                    region='wt-wt',  # International (no specific region) results
                    max_results=self.num_results * 3,  # Over-fetch: many hits get filtered out
                )
                # Consume the results while the session is still open.
                for result in results or []:
                    url = result.get('href') or result.get('link')
                    if not self._is_company_url(url):
                        continue
                    urls.append(url)
                    if len(urls) >= self.num_results:
                        break
            logger.info(f"Found {len(urls)} relevant URLs")
            return urls
        except Exception as e:
            # Best-effort boundary: any search backend failure degrades to "no results".
            logger.error(f"Search failed: {e}")
            logger.info("Tip: Try using CSV upload as an alternative")
            return []

    def validate_url(self, url: str) -> bool:
        """Validate that a URL is properly formatted (has both a scheme and a host part)."""
        try:
            result = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # Malformed URL or a non-string argument
            return False
        return bool(result.scheme and result.netloc)
class CSVHandler:
    """Handles CSV list-based website input"""

    # Accepted header names, compared case-insensitively after trimming whitespace.
    _URL_COLUMNS = ('url', 'website', 'link')
    _NAME_COLUMNS = ('company', 'name', 'company_name')

    @staticmethod
    def _find_column(columns, candidates) -> Any:
        """Return the first label in *columns* whose normalized name is in *candidates*, else None."""
        for col in columns:
            # str() guards against non-string labels (e.g. integer headers)
            if str(col).strip().lower() in candidates:
                return col
        return None

    @staticmethod
    def _normalize_url(value) -> str:
        """Return a cleaned URL with a scheme, or '' when the cell holds no usable URL."""
        if pd.isna(value):
            return ''
        url = str(value).strip()
        if url.lower() in ('', 'nan', 'none'):
            return ''
        # Add https:// if no scheme is present (scheme check is case-insensitive,
        # so "HTTP://example.com" is left alone rather than double-prefixed)
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url
        return url

    def read_urls_from_csv(self, file_path: Path) -> List[Dict[str, str]]:
        """
        Read URLs from a CSV file

        Expected CSV format:
        - Must have a column named 'url', 'website' or 'link' (case-insensitive)
        - Optional: 'company', 'name' or 'company_name' column

        Args:
            file_path: Path to CSV file

        Returns:
            List of dictionaries with 'url' and optional 'company' keys

        Raises:
            ValueError: If no URL column is found
            Exception: Any error raised while reading the file is logged and re-raised
        """
        logger.info(f"Reading URLs from CSV: {file_path}")
        try:
            # dtype=str keeps cells verbatim; type inference would turn a
            # company name such as "007" into the number 7.
            df = pd.read_csv(file_path, dtype=str)

            url_column = self._find_column(df.columns, self._URL_COLUMNS)
            if url_column is None:
                raise ValueError("CSV must have a column named 'url', 'URL', 'website', or 'link'")
            name_column = self._find_column(df.columns, self._NAME_COLUMNS)

            if name_column is None:
                names = [None] * len(df)
            else:
                names = df[name_column]

            results = []
            for raw_url, raw_name in zip(df[url_column], names):
                url = self._normalize_url(raw_url)
                if not url:
                    continue  # empty / missing cell
                entry = {'url': url}
                if pd.notna(raw_name):
                    company = str(raw_name).strip()
                    if company:
                        entry['company'] = company
                results.append(entry)

            logger.info(f"Successfully loaded {len(results)} URLs from CSV")
            return results
        except Exception as e:
            logger.error(f"Error reading CSV: {e}")
            raise

    def create_sample_csv(self, output_path: Path) -> None:
        """Create a sample CSV file (columns 'company' and 'url') for reference"""
        sample_data = [
            {"company": "Example Corp", "url": "https://example.com"},
            {"company": "Test Inc", "url": "https://test-company.com"},
            {"company": "Demo Ltd", "url": "https://demo.fi"},
        ]
        df = pd.DataFrame(sample_data)
        df.to_csv(output_path, index=False)
        logger.info(f"Created sample CSV at {output_path}")
class InputCoordinator:
    """Coordinates between search and CSV input methods.

    Both public methods are coroutines. The underlying handlers do blocking
    network / file I/O, so the work is run in the event loop's default
    executor instead of directly on the loop.
    """

    def __init__(self):
        self.search_handler = SearchHandler()
        self.csv_handler = CSVHandler()

    async def get_urls_from_search(self, search_term: str, num_results: int) -> List[Dict[str, str]]:
        """
        Get URLs using search term

        Args:
            search_term: Search query passed to the search handler
            num_results: Maximum number of URLs to return

        Returns:
            List of dicts with 'url' key
        """
        import copy  # local import: only this method needs it

        # Still recorded on the shared handler, as callers may inspect it.
        self.search_handler.num_results = num_results
        # Search on a snapshot so overlapping calls with different limits
        # cannot change each other's num_results while running in the executor.
        handler = copy.copy(self.search_handler)

        loop = asyncio.get_running_loop()
        urls = await loop.run_in_executor(None, handler.search_companies, search_term)
        return [{"url": url} for url in urls]

    async def get_urls_from_csv(self, csv_file_path: Path) -> List[Dict[str, str]]:
        """
        Get URLs from CSV file

        Args:
            csv_file_path: Path to the CSV file to read

        Returns:
            List of dicts with 'url' and optional 'company' keys
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.csv_handler.read_urls_from_csv, csv_file_path
        )