diff --git a/.gitignore b/.gitignore index a4133fb..709a1c0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .DS_Store __pycache__/* +scrapers/telegram-scraper/config.yml +scrapers/telegram-scraper/*session \ No newline at end of file diff --git a/scrapers/telegram-scraper/README.md b/scrapers/telegram-scraper/README.md new file mode 100644 index 0000000..2317bc7 --- /dev/null +++ b/scrapers/telegram-scraper/README.md @@ -0,0 +1,185 @@ +# Telegram Scraper + +This Python script uses [Telethon](https://github.com/LonamiWebs/Telethon) to scrape messages, images, and videos from a public or private Telegram channel or group. It supports filtering by date range, forward/backward traversal, media downloads, and exports structured JSON data for analysis. + +--- + +## ๐Ÿ“ฆ Features + +- Download messages from a specific Telegram channel or group +- Optionally download images and/or videos +- Filter messages by start and end date +- Store grouped metadata (message ID, grouped ID, timestamp, content) +- Output three structured files: + - Message data + - Channel-wide metadata + - Metadata for the current scraping run + +--- + +## ๐Ÿ›  Requirements + +- Python 3.8+ +- [Telethon](https://docs.telethon.dev/en/stable/) +- PyYAML + +Install dependencies with: + +```bash +pip install -r requirements.txt +``` + +--- + +## ๐Ÿ“„ Configuration + +Create a file named `config.yml` in the same directory as the script with the following structure: + +```yaml +telegram: + api_id: YOUR_API_ID + api_hash: YOUR_API_HASH + session_name: my_session + +scraping: + target_group: https://t.me/your_channel_or_group + limit: 1000 + download_images: true + download_videos: true + start_date: 2024-01-01 + end_date: 2024-12-31 + scrape_forward: false + offset_id: null + offset_date: null +``` + +> โš ๏ธ Do **not** commit your real `config.yml`. Instead, commit the `config_template.yml` to share structure. 
+ +--- + +## ๐Ÿš€ Usage + +```bash +python telegram_scraper.py +``` + +After running, the script will generate a folder like: + +``` +_/ +โ”œโ”€โ”€ images/ (if enabled) +โ”œโ”€โ”€ videos/ (if enabled) +โ”œโ”€โ”€ _.json # message data +โ”œโ”€โ”€ __run_metadata.json +โ”œโ”€โ”€ __channel_metadata.json +``` + +--- + +## ๐Ÿง  How It Works + +- `start_date` and `end_date` are optional filters +- `reverse = false`: Scrape backward from latest messages (default) +- `reverse = true`: Scrape forward from `offset_date` or the oldest message +- Messages are stored in JSON grouped by their `grouped_id` when available + +--- + +## ๐Ÿงผ Example Output + +### Content and message metadata + +```json +{ + "1441684517": { + "7200": { + "grouped_id": 13954964386683269, + "datetime": "2025-04-11T11:22:28+00:00", + "content": "", + "media_saved": [ + "image" + ] + }, + "7199": { + "grouped_id": 13954964386683269, + "datetime": "2025-04-11T11:22:28+00:00", + "content": "", + "media_saved": [ + "image", "video" + ] + }, + "7198": { + "grouped_id": 13954964386683269, + "datetime": "2025-04-11T11:22:28+00:00", + "content": "", + "media_saved": [ + "image" + ] + }, + } +} +``` + +### Current run metadata + +```json +{ + "channel_name": "", + "channel_id": 1441684517, + "channel_url": "https://t.me/url", + "first_message_datetime": "2025-04-10T10:19:20+00:00", + "last_message_datetime": "2025-04-11T11:22:28+00:00", + "message_count": 11, + "image_count": 10, + "video_count": 1, + "config_used": { + "target_group": "https://telegram.me/url", + "limit": 1000, + "start_date": "2025-04-10T00:00:00+00:00", + "end_date": "2025-04-12T00:00:00+00:00", + "scrape_forward": false, + "offset_id": null, + "offset_date": "2025-04-10T00:00:00+00:00", + "download_images": true, + "download_videos": true + } +} +``` + +### Channel metadata + +```json +{ + "channel_name": "name", + "channel_id": 1441684517, + "channel_url": "https://t.me/name", + "total_message_count": 6691, + "first_message_id": 1, + 
"first_message_datetime": "2022-09-24T07:42:27+00:00", + "last_message_id": 7220, + "last_message_datetime": "2025-04-17T19:00:08+00:00" +} +``` + +--- + +## โœ… Tips + +- Use `offset_date` only when `scrape_forward: true` +- You can test scraping metadata without downloading media by setting: + ```yaml + download_images: false + download_videos: false + ``` + +--- + +## ๐Ÿ›ก Security + +- Your `api_id` and `api_hash` are **sensitive** +- Use `.gitignore` to prevent `config.yml` from being committed: + +```gitignore +scrapers/telegram-scraper/config.yml +scrapers/telegram-scraper/*.session +``` diff --git a/scrapers/telegram-scraper/config_template.yml b/scrapers/telegram-scraper/config_template.yml new file mode 100644 index 0000000..c57ccc4 --- /dev/null +++ b/scrapers/telegram-scraper/config_template.yml @@ -0,0 +1,16 @@ +# Copy (or rename) this file to config.yml and fill in the required fields +telegram: + api_id: 123456 + api_hash: 'f123x123' + session_name: 'name of your session' + +scraping: + target_group: 'https://telegram.me/group_name' + limit: 1000 + scrape_forward: false # Set to True to scrape from oldest to newest. 
import os
import json
import yaml
from datetime import date, datetime, timezone
from telethon.sync import TelegramClient
from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument


def load_config(path='config.yml'):
    """Load the YAML config file and return its contents as a dict."""
    with open(path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


def parse_date(value):
    """Normalize an optional config date to a UTC-aware datetime.

    Accepts None/empty, a 'YYYY-MM-DD' string, or a date/datetime object.
    PyYAML's implicit resolver turns unquoted dates such as ``2024-01-01``
    into ``datetime.date`` objects, which the strptime-only version crashed on.
    """
    if not value:
        return None
    if isinstance(value, datetime):
        # Keep an existing tz if present; otherwise assume UTC.
        return value if value.tzinfo else value.replace(tzinfo=timezone.utc)
    if isinstance(value, date):
        return datetime(value.year, value.month, value.day, tzinfo=timezone.utc)
    return datetime.strptime(value, '%Y-%m-%d').replace(tzinfo=timezone.utc)


def generate_fallback_photo_id(message):
    """Return a stable ID for a photo message: Telegram's photo ID when
    available, otherwise the message's UNIX timestamp as a fallback."""
    photo = getattr(message.media, 'photo', None)
    photo_id = getattr(photo, 'id', None)
    return str(photo_id) if photo_id is not None else str(int(message.date.timestamp()))


def write_json(path, payload):
    """Write *payload* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    config = load_config()

    # Telegram API credentials
    api_id = config['telegram']['api_id']
    api_hash = config['telegram']['api_hash']
    session_name = config['telegram']['session_name']

    # Scraping parameters
    scraping = config['scraping']
    target_group = scraping['target_group']
    limit = scraping.get('limit', 1000)
    download_images = scraping.get('download_images', False)
    download_videos = scraping.get('download_videos', False)
    start_date = parse_date(scraping.get('start_date'))
    end_date = parse_date(scraping.get('end_date'))
    # Default explicitly to False so a missing key does not pass None to Telethon.
    reverse = scraping.get('scrape_forward', False)
    offset_id = scraping.get('offset_id')
    offset_date = parse_date(scraping.get('offset_date')) or start_date

    if start_date and end_date and start_date > end_date:
        raise ValueError("start_date cannot be after end_date.")

    with TelegramClient(session_name, api_id, api_hash) as client:
        entity = client.get_entity(target_group)
        group_id = entity.id
        group_name = entity.username or entity.title or "unknown"
        group_url = f"https://t.me/{entity.username}" if entity.username else None

        folder_name = f"{group_id}_{group_name}".replace(" ", "_")
        os.makedirs(folder_name, exist_ok=True)

        images_folder = os.path.join(folder_name, 'images')
        videos_folder = os.path.join(folder_name, 'videos')
        if download_images:
            os.makedirs(images_folder, exist_ok=True)
        if download_videos:
            os.makedirs(videos_folder, exist_ok=True)

        data = {str(group_id): {}}
        first_msg_time = None
        last_msg_time = None
        image_count = 0
        video_count = 0
        message_count = 0

        iter_args = {"entity": entity, "limit": limit, "reverse": reverse}
        if offset_id is not None:
            iter_args["offset_id"] = offset_id
        elif offset_date is not None and reverse:
            # offset_date is only a meaningful starting point when walking forward.
            iter_args["offset_date"] = offset_date

        for message in client.iter_messages(**iter_args):
            if not message:
                continue
            # iter_messages yields messages in strict date order, so once we
            # cross the far edge of the requested window we can stop instead
            # of spending the rest of `limit` on messages that can never
            # match (the previous version used `continue` on both edges).
            if reverse:  # oldest -> newest
                if start_date and message.date < start_date:
                    continue  # not inside the window yet
                if end_date and message.date > end_date:
                    break     # past the window for good
            else:  # newest -> oldest (default)
                if end_date and message.date > end_date:
                    continue
                if start_date and message.date < start_date:
                    break

            msg_id = message.id
            grouped_id = getattr(message, 'grouped_id', None)
            msg_time = message.date
            content = message.message
            media_saved = []

            mime_type = ''
            if isinstance(message.media, MessageMediaDocument):
                # mime_type may be missing/None on some documents; the old
                # chained .startswith() raised AttributeError in that case.
                mime_type = getattr(message.media.document, 'mime_type', None) or ''

            if download_images and isinstance(message.media, MessageMediaPhoto):
                media_id = generate_fallback_photo_id(message)
                filename = os.path.join(images_folder, f"{group_id}_{msg_id}_{media_id}_photo.jpg")
                if client.download_media(message, file=filename):
                    image_count += 1
                    media_saved.append("image")
            elif download_videos and mime_type.startswith("video/"):
                media_id = str(getattr(message.media.document, 'id', int(msg_time.timestamp())))
                filename = os.path.join(videos_folder, f"{group_id}_{msg_id}_{media_id}_video.mp4")
                if client.download_media(message, file=filename):
                    video_count += 1
                    media_saved.append("video")

            if media_saved:
                # Only messages whose media was actually saved are recorded,
                # mirroring the counts written to the run metadata.
                message_count += 1
                first_msg_time = min(first_msg_time or msg_time, msg_time)
                last_msg_time = max(last_msg_time or msg_time, msg_time)

                data[str(group_id)][str(msg_id)] = {
                    "grouped_id": grouped_id,
                    "datetime": msg_time.isoformat(),
                    "content": content,
                    "media_saved": media_saved
                }

        # Channel-wide stats. `next(..., None)` guards empty channels, where
        # the previous bare `.__next__()` calls raised StopIteration.
        first_msg = next(iter(client.iter_messages(entity, limit=1, reverse=True)), None)
        last_msg = next(iter(client.iter_messages(entity, limit=1)), None)
        total_msg_count = client.get_messages(entity, limit=0).total

        channel_metadata = {
            "channel_name": group_name,
            "channel_id": group_id,
            "channel_url": group_url,
            "total_message_count": total_msg_count,
            "first_message_id": first_msg.id if first_msg else None,
            "first_message_datetime": first_msg.date.isoformat() if first_msg else None,
            "last_message_id": last_msg.id if last_msg else None,
            "last_message_datetime": last_msg.date.isoformat() if last_msg else None
        }

        channel_meta_json = os.path.join(folder_name, f"{folder_name}_channel_metadata.json")
        write_json(channel_meta_json, channel_metadata)

        message_json = os.path.join(folder_name, f"{folder_name}.json")
        write_json(message_json, data)

        run_meta = {
            "channel_name": group_name,
            "channel_id": group_id,
            "channel_url": group_url,
            "first_message_datetime": first_msg_time.isoformat() if first_msg_time else None,
            "last_message_datetime": last_msg_time.isoformat() if last_msg_time else None,
            "message_count": message_count,
            "image_count": image_count,
            "video_count": video_count,
            "config_used": {
                "target_group": target_group,
                "limit": limit,
                "start_date": start_date.isoformat() if start_date else None,
                "end_date": end_date.isoformat() if end_date else None,
                "scrape_forward": reverse,
                "offset_id": offset_id,
                "offset_date": offset_date.isoformat() if offset_date else None,
                "download_images": download_images,
                "download_videos": download_videos
            }
        }

        run_meta_json = os.path.join(folder_name, f"{folder_name}_run_metadata.json")
        write_json(run_meta_json, run_meta)

        print("\n✅ Scraping complete.")
        print(f"- Channel metadata saved to: {channel_meta_json}")
        print(f"- Messages saved to: {message_json}")
        print(f"- Metadata for the current run saved to: {run_meta_json}")
        if download_images:
            print(f"- Images saved in: {images_folder}")
        if download_videos:
            print(f"- Videos saved in: {videos_folder}")