Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 95 additions & 125 deletions feed_generators/openai_research_blog.py
Original file line number Diff line number Diff line change
@@ -1,154 +1,124 @@
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pytz
from feedgen.feed import FeedGenerator
import time
import logging
from email.utils import parsedate_to_datetime
from pathlib import Path

# Set up logging
import requests
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator

# Official OpenAI news feed and the research listing it links to.
RSS_URL = "https://openai.com/news/rss.xml"
BLOG_URL = "https://openai.com/news/research/"
CATEGORY = "Research"  # RSS <category> value used to filter feed items
FEED_NAME = "openai_research"  # basename of the generated feeds/feed_*.xml

# Console logging with timestamps, shared by every function below.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def stable_fallback_date(identifier):
    """Generate a stable date from a URL or title hash.

    BUG FIX: the previous implementation used the built-in ``hash()``,
    which is randomized per process for strings (PYTHONHASHSEED), so the
    "stable" fallback date actually changed on every run.  A
    cryptographic digest makes the result deterministic across runs and
    platforms.

    Args:
        identifier: Any string (typically the article link) to derive a
            date from.

    Returns:
        An aware UTC datetime in [2023-01-01, 2023-01-01 + 730 days).
    """
    import hashlib
    from datetime import timezone

    digest = hashlib.sha256(identifier.encode("utf-8")).hexdigest()
    day_offset = int(digest, 16) % 730  # spread fallbacks over ~2 years
    epoch = datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
    return epoch + timedelta(days=day_offset)
def get_project_root():
    """Return the project root directory (two levels above this module)."""
    return Path(__file__).parents[1]


def setup_selenium_driver():
    """Build a headless Chrome instance via undetected-chromedriver.

    Window size, anti-automation-detection flag and a fixed desktop
    user agent are applied so the page renders like a normal browser.
    """
    chrome_flags = [
        "--headless",  # Ensure headless mode is enabled
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ]
    options = uc.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)
    return uc.Chrome(options=options)


def fetch_news_content_selenium(url):
    """Fetch the fully loaded HTML content of a webpage using Selenium.

    Errors are logged and re-raised to the caller; the browser is always
    quit on exit, successful or not.
    """
    browser = None
    try:
        logger.info(f"Fetching content from URL: {url}")
        browser = setup_selenium_driver()
        browser.get(url)

        # Give client-side rendering a moment to finish before grabbing
        # the DOM.
        wait_time = 5
        logger.info(f"Waiting {wait_time} seconds for the page to fully load...")
        time.sleep(wait_time)

        page_html = browser.page_source
        logger.info("Successfully fetched HTML content")
        return page_html
    except Exception as e:
        logger.error(f"Error fetching content: {e}")
        raise
    finally:
        if browser is not None:
            browser.quit()


def parse_openai_news_html(html_content):
    """Parse the HTML content from OpenAI's Research News page.

    Builds a list of article dicts (title, link, date, category,
    description).  Articles with an unparseable or missing date get a
    stable fallback date derived from their link.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    articles = []

    # Extract news items that contain `/index` in the href
    news_items = soup.select("a[href*='/index']")  # Look for links containing '/index'

    for item in news_items:
        try:
            # Extract title
            title_elem = item.select_one("div.line-clamp-4")
            if not title_elem:
                # Anchor without a visible headline — not an article card.
                continue
            title = title_elem.text.strip()

            # Extract link
            link = "https://openai.com" + item["href"]

            # Extract date
            date_elem = item.select_one("span.text-small")
            if date_elem:
                try:
                    # Page dates look like "Jan 01, 2024".
                    date = datetime.strptime(date_elem.text.strip(), "%b %d, %Y")
                    date = date.replace(tzinfo=pytz.UTC)
                except Exception:
                    logger.warning(f"Date parsing failed for article: {title}")
                    date = stable_fallback_date(link)
            else:
                date = stable_fallback_date(link)

            articles.append(
                {
                    "title": title,
                    "link": link,
                    "date": date,
                    "category": "Research",
                    # No summary is available on the card; reuse the title.
                    "description": title,
                }
            )
        except Exception as e:
            # Best effort: a single malformed card must not abort the run.
            logger.warning(f"Skipping an article due to parsing error: {e}")
def fetch_rss_content(url: str = RSS_URL) -> str:
    """Fetch the official OpenAI news RSS feed.

    Args:
        url: Feed URL; defaults to the official OpenAI news RSS feed.

    Returns:
        The raw XML text of the feed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    logger.info("Fetching RSS content from %s", url)
    browser_ua = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    )
    resp = requests.get(url, headers={"User-Agent": browser_ua}, timeout=30)
    resp.raise_for_status()
    return resp.text


def parse_research_posts(rss_content: str) -> list[dict]:
    """Parse Research-tagged items from the official OpenAI RSS feed.

    FIX: removed stray leftover lines from the old HTML-scraper
    implementation that referenced an undefined ``articles`` name and
    returned early; also guard the RFC 2822 date parse so one malformed
    ``pubDate`` cannot abort the whole feed.

    Args:
        rss_content: Raw XML text of the RSS feed.

    Returns:
        A list of dicts with keys ``title``, ``link``, ``description``,
        ``date`` (aware datetime or None) and ``category``, one per
        Research-tagged item.
    """
    soup = BeautifulSoup(rss_content, "xml")
    posts = []

    for item in soup.find_all("item"):
        # Only keep items explicitly tagged with the Research category.
        category = item.find("category")
        if not category or category.get_text(strip=True) != CATEGORY:
            continue

        title = item.find("title")
        link = item.find("link")
        description = item.find("description")
        pub_date = item.find("pubDate")

        if not title or not link:
            logger.warning("Skipping item missing title or link")
            continue

        # Leave the date as None when absent or malformed; the feed
        # generator decides how to order undated posts.
        parsed_date = None
        if pub_date and pub_date.get_text(strip=True):
            try:
                parsed_date = parsedate_to_datetime(pub_date.get_text(strip=True))
            except (TypeError, ValueError):
                logger.warning(
                    "Unparseable pubDate for item: %s", title.get_text(strip=True)
                )

        posts.append(
            {
                "title": title.get_text(strip=True),
                "link": link.get_text(strip=True),
                "description": description.get_text(strip=True) if description else "",
                "date": parsed_date,
                "category": CATEGORY,
            }
        )

    logger.info("Parsed %s research posts", len(posts))
    return posts

def generate_rss_feed(posts: list[dict]) -> FeedGenerator:
    """Generate the RSS feed for OpenAI Research posts.

    BUG FIX: the sort key previously fell back to the *string*
    ``"1970-01-01"`` for undated posts, which raises ``TypeError`` as
    soon as a datetime and a string are compared (i.e. whenever dated
    and undated posts coexist).  A ``(has_date, date)`` tuple key keeps
    the comparison type-safe: undated posts sink to the bottom without
    their second element ever being compared against a datetime.

    Args:
        posts: Dicts with ``title``, ``link``, ``description``, ``date``
            (aware datetime or None) and ``category`` keys.

    Returns:
        A populated :class:`FeedGenerator`, entries sorted newest first.
    """
    fg = FeedGenerator()
    fg.title("OpenAI Research News")
    fg.description("Latest research news and updates from OpenAI")
    fg.link(href=BLOG_URL)
    fg.language("en")

    # Newest first; posts without a date sort last.
    ordered = sorted(
        posts,
        key=lambda p: (p["date"] is not None, p["date"] or 0),
        reverse=True,
    )

    for post in ordered:
        fe = fg.add_entry()
        fe.title(post["title"])
        fe.link(href=post["link"])
        fe.description(post["description"])

        if post["date"] is not None:
            fe.published(post["date"])

        fe.category(term=post["category"])

    logger.info("RSS feed generated successfully")
    return fg


def save_rss_feed(feed_generator, feed_name="openai_research"):
"""Save RSS feed to an XML file."""
feeds_dir = Path("feeds")
def main() -> None:
    """Generate and save the OpenAI Research RSS feed.

    Fetches the official RSS feed, filters it down to Research posts and
    writes ``feeds/feed_openai_research.xml`` under the project root.
    An empty feed is still written when no posts are found, so the
    output file always exists.

    FIX: removed a stale duplicate Selenium-based ``main`` definition
    and orphaned ``save_rss_feed`` fragment lines (referencing the
    undefined names ``feed_generator``/``feed_name``) left over from the
    old implementation, plus a dead ``posts = []`` reassignment.
    """
    rss_content = fetch_rss_content()
    posts = parse_research_posts(rss_content)

    if not posts:
        logger.warning("No research posts found in the OpenAI RSS feed")

    feeds_dir = get_project_root() / "feeds"
    feeds_dir.mkdir(exist_ok=True)
    output_file = feeds_dir / f"feed_{FEED_NAME}.xml"

    generate_rss_feed(posts).rss_file(str(output_file), pretty=True)
    logger.info("Saved RSS feed to %s", output_file)


if __name__ == "__main__":
Expand Down
Loading