AgenticCrawling/input_handler.py at master · solita/AgenticCrawling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""
Search and input handling for website discovery
Supports both search-term based and CSV list-based input
"""
import csv
import asyncio
from typing import List, Dict, Any
from pathlib import Path
from urllib.parse import urlparse
from loguru import logger
from ddgs import DDGS
import pandas as pd


class SearchHandler:
    """Handles search-term based website discovery.

    Runs a DuckDuckGo text search through the ``ddgs`` client and keeps only
    the results that pass :meth:`_is_company_url`.
    """

    # Host-name labels of social media, job boards, listing sites, etc.
    # A result is dropped when any dot-separated label of its host is in
    # this set (e.g. "fi.linkedin.com" is dropped, "mediumsoft.fi" is kept).
    SKIP_DOMAINS = frozenset({
        'linkedin', 'facebook', 'twitter', 'instagram',
        'indeed', 'glassdoor', 'youtube', 'wikipedia',
        'reddit', 'quora', 'medium', 'pinterest',
        'clutch', 'goodfirms', 'themanifest', 'techbehemoths',
        'pitchbook', 'crunchbase', 'softwarecompanynetwork',
    })

    def __init__(self, num_results: int = 5):
        # Maximum number of URLs returned by search_companies().
        self.num_results = num_results

    @classmethod
    def _is_company_url(cls, url: str) -> bool:
        """Return True when *url* is an http(s) URL whose host is not skipped.

        Malformed URLs (for which ``urlparse`` raises ``ValueError``) and
        URLs without a host are rejected instead of raising, so that a single
        bad search result cannot abort the whole search.
        """
        # Skip non-http(s) URLs
        if not url.startswith(('http://', 'https://')):
            return False

        try:
            # .hostname is lower-cased and excludes port / user info.
            host = urlparse(url).hostname or ''
        except ValueError:
            return False

        if not host:
            return False

        # Compare whole labels rather than substrings so that unrelated
        # hosts that merely contain a skipped word are not filtered out.
        return not any(label in cls.SKIP_DOMAINS for label in host.split('.'))

    def search_companies(self, search_term: str) -> List[str]:
        """
        Search for companies using DuckDuckGo and return URLs

        Args:
            search_term: Search query (e.g., "Software development consultancy finland")

        Returns:
            List of at most ``self.num_results`` website URLs. An empty list
            is returned when ``self.num_results`` is not positive or when the
            search raises (the error is logged, not propagated).
        """
        logger.info(f"Searching for: '{search_term}' (top {self.num_results} results)")

        # Without this guard the first matching URL would be appended before
        # the limit check, returning one result although none were requested.
        if self.num_results <= 0:
            logger.info("Found 0 relevant URLs")
            return []

        try:
            urls = []

            # Use DuckDuckGo search (more reliable than Google for automation)
            with DDGS() as ddgs:
                results = ddgs.text(
                    search_term,
                    region='wt-wt',  # International English results
                    max_results=self.num_results * 3  # Get more to filter
                )

                for result in results:
                    url = result.get('href') or result.get('link')
                    if not url or not self._is_company_url(url):
                        continue

                    urls.append(url)

                    if len(urls) >= self.num_results:
                        break

            logger.info(f"Found {len(urls)} relevant URLs")
            return urls

        except Exception as e:
            # Best-effort boundary: any search failure degrades to "no results".
            logger.error(f"Search failed: {e}")
            logger.info("Tip: Try using CSV upload as an alternative")
            return []

    def validate_url(self, url: str) -> bool:
        """Validate that a URL is properly formatted

        Returns:
            True when *url* parses with both a scheme and a network location,
            False otherwise (including when it cannot be parsed at all).
        """
        try:
            result = urlparse(url)
        except (ValueError, AttributeError, TypeError):
            # ValueError: malformed URL (e.g. unbalanced IPv6 brackets);
            # AttributeError/TypeError: argument is not str/bytes.
            return False
        return bool(result.scheme and result.netloc)


class CSVHandler:
    """Handles CSV list-based website input"""

    # Accepted header names (compared case-insensitively, whitespace-trimmed).
    URL_COLUMNS = ('url', 'website', 'link')
    NAME_COLUMNS = ('company', 'name', 'company_name')

    def __init__(self):
        pass

    @staticmethod
    def _find_column(columns, candidates):
        """Return the first entry of *columns* whose name is in *candidates*.

        Names are compared after ``str()`` conversion, trimming surrounding
        whitespace and lower-casing, so headers such as ``" URL "`` match and
        non-string headers do not raise. Returns ``None`` when nothing
        matches. The original (unmodified) column label is returned so it can
        be used to index the DataFrame.
        """
        for col in columns:
            if str(col).strip().lower() in candidates:
                return col
        return None

    @staticmethod
    def _normalize_url(raw_value) -> str:
        """Turn one CSV cell into a URL string, or ``''`` if the cell is empty.

        Cells whose text is empty, ``nan`` or ``none`` (any case) are treated
        as missing. ``https://`` is prepended when the value has no http(s)
        scheme; the scheme test is case-insensitive so values such as
        ``HTTP://example.com`` are left untouched.
        """
        url = str(raw_value).strip()

        if not url or url.lower() in ('nan', 'none'):
            return ''

        # Add https:// if missing
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        return url

    def read_urls_from_csv(self, file_path: Path) -> List[Dict[str, str]]:
        """
        Read URLs from a CSV file

        Expected CSV format:
        - Must have a column named 'url', 'website' or 'link'
          (case-insensitive, e.g. 'URL')
        - Optional: 'company', 'name' or 'company_name' column

        Args:
            file_path: Path to CSV file

        Returns:
            List of dictionaries with 'url' and optional 'company' keys.
            Rows with an empty URL cell are skipped.

        Raises:
            ValueError: If no URL column is found.
            Exception: Any error raised while reading the file is logged and
                re-raised unchanged.
        """
        logger.info(f"Reading URLs from CSV: {file_path}")

        try:
            df = pd.read_csv(file_path)

            # Find URL column
            url_column = self._find_column(df.columns, self.URL_COLUMNS)
            if url_column is None:
                raise ValueError("CSV must have a column named 'url', 'URL', 'website', or 'link'")

            # Find company name column (optional)
            name_column = self._find_column(df.columns, self.NAME_COLUMNS)
            if name_column is None:
                names = [None] * len(df)
            else:
                names = df[name_column]

            # Extract data
            results = []
            for raw_url, raw_name in zip(df[url_column], names):
                url = self._normalize_url(raw_url)
                if not url:
                    continue

                entry = {'url': url}

                # pd.notna is False for both None and NaN cells.
                if pd.notna(raw_name):
                    entry['company'] = str(raw_name).strip()

                results.append(entry)

            logger.info(f"Successfully loaded {len(results)} URLs from CSV")
            return results

        except Exception as e:
            logger.error(f"Error reading CSV: {e}")
            raise

    def create_sample_csv(self, output_path: Path):
        """Create a sample CSV file for reference

        Writes three example rows with 'company' and 'url' columns to
        *output_path* (no index column).
        """
        sample_data = [
            {"company": "Example Corp", "url": "https://example.com"},
            {"company": "Test Inc", "url": "https://test-company.com"},
            {"company": "Demo Ltd", "url": "https://demo.fi"},
        ]

        df = pd.DataFrame(sample_data)
        df.to_csv(output_path, index=False)
        logger.info(f"Created sample CSV at {output_path}")


class InputCoordinator:
    """Coordinates between search and CSV input methods

    Both coroutine methods delegate to synchronous handlers. The blocking
    work is run in the event loop's default executor so that awaiting these
    methods does not stall other tasks on the loop.
    """

    def __init__(self):
        self.search_handler = SearchHandler()
        self.csv_handler = CSVHandler()

    async def get_urls_from_search(self, search_term: str, num_results: int) -> List[Dict[str, str]]:
        """
        Get URLs using search term

        Args:
            search_term: Query passed to the search handler.
            num_results: Maximum number of URLs to return; also stored on
                ``self.search_handler.num_results``.

        Returns:
            List of dicts with 'url' key
        """
        import copy

        self.search_handler.num_results = num_results

        # Run the search on a shallow copy so that concurrent calls with
        # different num_results values cannot affect each other while the
        # search is executing in a worker thread.
        worker = copy.copy(self.search_handler)
        worker.num_results = num_results

        loop = asyncio.get_running_loop()
        urls = await loop.run_in_executor(None, worker.search_companies, search_term)
        return [{"url": url} for url in urls]

    async def get_urls_from_csv(self, csv_file_path: Path) -> List[Dict[str, str]]:
        """
        Get URLs from CSV file

        Args:
            csv_file_path: Path to the CSV file to read.

        Returns:
            List of dicts with 'url' and optional 'company' keys

        Raises:
            Whatever the CSV handler raises; the exception propagates through
            the executor to the awaiting caller.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.csv_handler.read_urls_from_csv, csv_file_path
        )