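"""ProxyGather: unified proxy scraping and checking CLI.

Illustrative invocations (the subcommands and flags are defined in main()
below; the file names shown here are placeholders, not required names):

    python ProxyGather.py scrape --scraper-threads 50
    python ProxyGather.py check --input scraped-proxies.txt --checker-threads 500 --timeout 6s
    python ProxyGather.py run --checker-input extra-proxies.txt --checker-output working-proxies.txt
"""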
import argparse
import sys
import queue
import threading
from datetime import datetime
from collections import defaultdict
from typing import Set, Dict
from urllib.parse import urlparse
import re
import random
from ScrapeAllProxies import run_scraper_pipeline, list_available_scrapers, show_legal_disclaimer, DEFAULT_OUTPUT_FILE
from CheckProxies import run_checker_pipeline
from helper.termination import termination_context, should_terminate
def get_source_identifier(url: str, scraper_name: str) -> str:
    """Returns a clean source identifier from URL or scraper name."""
    if scraper_name not in ['Websites', 'Discover']:
        return scraper_name

    if not url or url == "N/A":
        return scraper_name

    gh_pattern = re.compile(r'https?://(?:www\.)?(?:cdn\.jsdelivr\.net/gh|fastly\.jsdelivr\.net/gh|raw\.githubusercontent\.com|github\.com)/([^/]+)/([^/@#?]+)')
    match = gh_pattern.search(url)
    if match:
        user, repo = match.groups()
        return f"github:{user}/{repo}"

    try:
        netloc = urlparse(url).netloc
        if netloc.startswith("www."):
            netloc = netloc[4:]
        return netloc
    except Exception:
        return scraper_name
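# Illustrative behaviour of get_source_identifier (example URLs are made up and
# not used anywhere else in this file):
#   get_source_identifier("https://raw.githubusercontent.com/foo/bar/main/http.txt", "Websites")
#       -> "github:foo/bar"
#   get_source_identifier("https://www.example.com/proxy-list", "Discover") -> "example.com"
#   get_source_identifier("https://example.org/list", "SomeScraper") -> "SomeScraper"
#       (scrapers other than 'Websites'/'Discover' keep their own name)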
def handle_pre_checks(args):
    """
    Handles 'List Scrapers' mode and the legal disclaimer check on the main thread.
    Returns True if we should proceed, False if we should exit (e.g. after printing the list).
    """
    # An empty --only or --exclude list means "list the available scrapers and exit".
    if (args.only is not None and not args.only) or (args.exclude is not None and not args.exclude):
        list_available_scrapers(args)
        return False

    if not args.compliant:
        show_legal_disclaimer(auto_accept=args.yes)

    return True
def cmd_scrape(args):
    print("=== ProxyGather Scraping Mode ===")
    if not handle_pre_checks(args):
        return

    # Use a timestamped output file if the default is still set
    if args.output == DEFAULT_OUTPUT_FILE:
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        args.output = f"scraped-proxies-{timestamp}.txt"

    # Ensure a 'threads' attribute exists for compatibility with ScrapeAllProxies
    if not hasattr(args, 'threads'):
        args.threads = args.scraper_threads

    final_proxies, results = run_scraper_pipeline(args, skip_disclaimer=True)

    # Display a per-source summary
    if results:
        print("\n" + "=" * 60)
        print(f"{'Source':<45} | {'Proxies':<10}")
        print("-" * 60)

        # Per-source counts. For the 'Websites' scraper there is no per-URL
        # breakdown in results, so its entry is simply the total count.
        source_counts = {source: len(proxies) if proxies else 0
                         for source, proxies in results.items()}

        # Sort by count, descending
        sorted_stats = sorted(source_counts.items(), key=lambda x: x[1], reverse=True)
        for source, count in sorted_stats:
            display_source = source[:43] if len(source) > 43 else source
            print(f"{display_source:<45} | {count:<10}")

        print("-" * 60)
        print(f"{'TOTAL (Unique)':<45} | {len(final_proxies):<10}")
        print("=" * 60)
def cmd_check(args):
    print("=== ProxyGather Checker Mode ===")
    run_checker_pipeline(args)
def cmd_run(args):
    print("=== ProxyGather Unified Run Mode ===")
    if not handle_pre_checks(args):
        return

    # --- Filename logic ---
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # 1. Scraper output: use --scraper-output if provided, otherwise a timestamped default
    scraped_file = args.scraper_output if args.scraper_output else f"scraped-proxies-{timestamp}.txt"

    # 2. Checker output: use --checker-output if provided, otherwise None for the timestamped default
    args.output = args.checker_output  # CheckProxies uses args.output

    proxy_queue = queue.Queue()
    stats = defaultdict(lambda: {'scraped': 0, 'working': 0})
    proxy_to_sources: Dict[str, Set[str]] = defaultdict(set)
    checked_cache: Dict[str, bool] = {}
    stats_lock = threading.Lock()

    # Load additional proxies from --checker-input files if provided
    additional_proxies_loaded = 0
    if args.checker_input:
        from CheckProxies import load_proxies_from_patterns
        additional_proxies = load_proxies_from_patterns(args.checker_input)
        for proxy in additional_proxies:
            proxy_queue.put(proxy)
            # Track these proxies as coming from the 'checker-input' source
            proxy_to_sources[proxy].add('checker-input')
            additional_proxies_loaded += 1
        if additional_proxies_loaded > 0:
            stats['checker-input']['scraped'] = additional_proxies_loaded
            print(f"[INFO] Loaded {additional_proxies_loaded} additional proxies from checker-input files", flush=True)

    def on_proxy_scraped(scraper_name, source_detail, proxies_found):
        source_id = get_source_identifier(source_detail, scraper_name)

        # Convert to a list and shuffle to avoid feeding sorted chunks of dead proxies to the checker
        proxies_list = list(proxies_found)
        random.shuffle(proxies_list)

        with stats_lock:
            stats[source_id]['scraped'] += len(proxies_list)
            for proxy in proxies_list:
                proxy_to_sources[proxy].add(source_id)
                if proxy in checked_cache and checked_cache[proxy]:
                    stats[source_id]['working'] += 1
                if len(proxy_to_sources[proxy]) == 1:
                    # First time this proxy has been seen: queue it for checking
                    proxy_queue.put(proxy)

    def on_proxy_checked(proxy, is_working, details):
        with stats_lock:
            checked_cache[proxy] = is_working
            if is_working:
                for source_id in proxy_to_sources.get(proxy, []):
                    stats[source_id]['working'] += 1

    class ScraperArgsWrapper:
        def __init__(self, original_args, output_file):
            self.__dict__ = original_args.__dict__.copy()
            self.output = output_file
            # Handle both --scraper-threads (creates the scraper_threads attr) and
            # --threads (the threads attr from ScrapeAllProxies standalone usage)
            self.threads = getattr(original_args, 'scraper_threads',
                                   getattr(original_args, 'threads', 50))

    def scraper_worker():
        s_args = ScraperArgsWrapper(args, scraped_file)
        run_scraper_pipeline(s_args, proxy_found_callback=on_proxy_scraped, handle_signals=False, skip_disclaimer=True)
        # Put a sentinel value on the queue to signal that the scraper is done
        proxy_queue.put(None)

    with termination_context():
        scraper_thread = threading.Thread(target=scraper_worker, name="ScraperOrchestrator")
        scraper_thread.start()

        run_checker_pipeline(args, input_queue=proxy_queue, result_callback=on_proxy_checked)

        scraper_thread.join()

    print("\n" + "=" * 60)
    print(f"{'Source':<40} | {'Scraped':<10} | {'Working':<10}")
    print("-" * 60)

    sorted_stats = sorted(stats.items(), key=lambda x: x[1]['working'], reverse=True)
    for source, data in sorted_stats:
        print(f"{source[:38]:<40} | {data['scraped']:<10} | {data['working']:<10}")

    unique_scraped = len(proxy_to_sources)
    unique_working = sum(1 for proxy, is_working in checked_cache.items() if is_working)

    print("-" * 60)
    print(f"{'TOTAL (Unique)':<40} | {unique_scraped:<10} | {unique_working:<10}")
    print("=" * 60)
def main():
    parser = argparse.ArgumentParser(description="ProxyGather: Unified Proxy Scraper and Checker")
    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    def add_scraper_args(p):
        p.add_argument('--output', default=DEFAULT_OUTPUT_FILE)
        # Note: 'scraper_threads' is used as the dest to avoid a conflict with the checker's 'threads'
        p.add_argument('--scraper-threads', type=int, default=50)
        p.add_argument('--automation-threads', type=int, default=3)
        p.add_argument('--turnstile-delay', type=float, default=0)
        p.add_argument('--remove-dead-links', action='store_true')
        p.add_argument('--compliant', action='store_true')
        p.add_argument('--use-browser-automation', action='store_true')
        p.add_argument('-y', '--yes', action='store_true')
        p.add_argument('--only', nargs='*')
        p.add_argument('--exclude', nargs='*', help="Exclude scrapers from the run")
        p.add_argument('-v', '--verbose', action='store_true')

    def add_checker_args(p):
        p.add_argument('--input', nargs='+', default=['scraped-proxies.txt'])
        p.add_argument('--checker-threads', dest='threads', type=int, default=500)
        p.add_argument('--timeout', type=str, default='6s')
        p.add_argument('--prepend-protocol', action='store_true')
        if not any(x.dest == 'verbose' for x in p._actions):
            p.add_argument('-v', '--verbose', action='store_true')

    p_scrape = subparsers.add_parser('scrape')
    add_scraper_args(p_scrape)

    p_check = subparsers.add_parser('check')
    add_checker_args(p_check)

    p_run = subparsers.add_parser('run')
    # Output configuration: separate arguments for the scraper and the checker
    p_run.add_argument('--scraper-output', help='Output file for scraped proxies (default: scraped-proxies-{timestamp}.txt)')
    p_run.add_argument('--checker-input', nargs='+', help='Additional proxy files to check alongside scraped proxies')
    p_run.add_argument('--checker-output', help='Output file for working proxies (default: working-proxies-{timestamp})')
    p_run.add_argument('--scraper-threads', type=int, default=50)
    p_run.add_argument('--checker-threads', dest='threads', type=int, default=500)
    p_run.add_argument('--automation-threads', type=int, default=3)
    p_run.add_argument('--turnstile-delay', type=float, default=0)
    p_run.add_argument('--remove-dead-links', action='store_true')
    p_run.add_argument('--compliant', action='store_true')
    p_run.add_argument('--use-browser-automation', action='store_true')
    p_run.add_argument('-y', '--yes', action='store_true')
    p_run.add_argument('--only', nargs='*')
    p_run.add_argument('--exclude', nargs='*')
    p_run.add_argument('-v', '--verbose', action='store_true')
    p_run.add_argument('--timeout', type=str, default='6s')
    p_run.add_argument('--prepend-protocol', action='store_true')

    args = parser.parse_args()

    if args.command == 'scrape':
        cmd_scrape(args)
    elif args.command == 'check':
        cmd_check(args)
    elif args.command == 'run':
        cmd_run(args)
    else:
        parser.print_help()
if __name__ == "__main__":
    main()