Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .github/workflows/run_feeds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,15 @@ concurrency:
group: request-feeds
cancel-in-progress: true

permissions:
contents: write

jobs:
run-feeds:
runs-on: ubuntu-latest
timeout-minutes: 30
env:
RSS_REPO_SLUG: ${{ github.repository }}

steps:
- name: Checkout repository
Expand Down Expand Up @@ -49,5 +54,5 @@ jobs:
echo "No changes to commit"
else
git commit -m 'Update RSS feeds'
git push || { git pull --rebase && git push; }
git push origin HEAD:main || { git pull --rebase origin main && git push origin HEAD:main; }
fi
7 changes: 6 additions & 1 deletion .github/workflows/run_selenium_feeds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,15 @@ concurrency:
group: selenium-feeds
cancel-in-progress: true

permissions:
contents: write

jobs:
run-selenium-feeds:
runs-on: ubuntu-latest
timeout-minutes: 60
env:
RSS_REPO_SLUG: ${{ github.repository }}

steps:
- name: Checkout repository
Expand Down Expand Up @@ -55,5 +60,5 @@ jobs:
echo "No changes to commit"
else
git commit -m 'Update RSS feeds (Selenium)'
git push || { git pull --rebase && git push; }
git push origin HEAD:main || { git pull --rebase origin main && git push origin HEAD:main; }
fi
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ Scraped feeds are generated hourly. "Official RSS" rows point to native feeds th
| [Google DeepMind Blog](https://deepmind.google/blog/) | [Official RSS](https://deepmind.google/blog/rss.xml) |
| [Google Developers Blog - AI](https://developers.googleblog.com/search/?technology_categories=AI) | [feed_google_ai.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_google_ai.xml) |
| [Groq Blog](https://groq.com/blog/) | [feed_groq.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_groq.xml) |
| [Hugging Face Blog](https://huggingface.co/blog) | [Official RSS](https://huggingface.co/blog/feed.xml) |
| [Hugging Face Blog (Ethics)](https://huggingface.co/blog?tag=ethics) | [feed_huggingface_ethics.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_huggingface_ethics.xml) |
| [Hugging Face Blog (Research)](https://huggingface.co/blog?tag=research) | [feed_huggingface_research.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_huggingface_research.xml) |
| [Hamel Husain's Blog](https://hamel.dev/) | [Official RSS](https://hamel.dev/index.xml) |
| [Interconnected (Matt Webb)](https://interconnected.org/home) | [Official RSS](https://interconnected.org/home/feed) |
| [Mistral AI News](https://mistral.ai/news) | [feed_mistral.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_mistral.xml) |
Expand All @@ -38,6 +41,7 @@ Scraped feeds are generated hourly. "Official RSS" rows point to native feeds th
| [Perplexity Hub](https://www.perplexity.ai/hub) | [feed_perplexity_hub.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_perplexity_hub.xml) |
| [Pinecone Blog](https://www.pinecone.io/blog/) | [feed_pinecone.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_pinecone.xml) |
| [Simon Willison's Blog (Tools)](https://simonwillison.net/) | [Official RSS](https://simonwillison.net/atom/beats/tool/) |
| [Stanford HAI News](https://hai.stanford.edu/news) | [feed_stanford_hai_news.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_stanford_hai_news.xml) |
| [Supabase Blog](https://supabase.com/blog) | [Official RSS](https://supabase.com/rss.xml) |
| [Surge AI Blog](https://www.surgehq.ai/blog) | [feed_blogsurgeai.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_blogsurgeai.xml) |
| [The Batch by DeepLearning.AI](https://www.deeplearning.ai/the-batch/) | [feed_the_batch.xml](https://raw.githubusercontent.com/Olshansk/rss-feeds/main/feeds/feed_the_batch.xml) |
Expand Down
184 changes: 184 additions & 0 deletions feed_generators/huggingface_blog_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""Shared helpers for Hugging Face blog tag feeds."""

from datetime import datetime

import pytz
import requests
from feedgen.feed import FeedGenerator

from utils import (
deserialize_entries,
load_cache,
merge_entries,
save_cache,
save_rss_feed,
setup_feed_links,
setup_logging,
sort_posts_for_feed,
stable_fallback_date,
)

# Module-level logger obtained from the project's setup_logging helper.
logger = setup_logging(__name__)

# Endpoint queried for blog listings; fetch_posts_page sends `tag` and `p` as query parameters.
HF_API_URL = "https://huggingface.co/api/blog"
# Host prefix that parse_api_posts prepends to post URLs starting with "/".
HF_BASE_URL = "https://huggingface.co"
# Headers sent with every API request: a descriptive User-Agent and a JSON Accept type.
API_HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; RSS Feed Generator)",
"Accept": "application/json",
}


def fetch_posts_page(tag: str, page: int) -> dict:
    """Request one page of tag-filtered blog listings and return the decoded JSON.

    Args:
        tag: Blog tag, sent as the ``tag`` query parameter.
        page: Page index, sent as the ``p`` query parameter.

    Returns:
        The response body as decoded by ``response.json()``.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    query = {"tag": tag, "p": page}
    reply = requests.get(HF_API_URL, params=query, headers=API_HEADERS, timeout=30)
    # Surface 4xx/5xx responses instead of trying to decode an error body.
    reply.raise_for_status()
    return reply.json()


def parse_api_posts(blogs: list[dict]) -> list[dict]:
    """Convert raw Hugging Face API blog objects into feed-ready post dicts.

    Entries whose title is missing or blank are dropped. Each remaining entry
    yields a dict with the keys ``title``, ``link``, ``date``, ``description``
    and ``category``.

    Args:
        blogs: Blog objects as returned under the API's ``allBlogs`` key.

    Returns:
        One post dict per usable blog object, in input order.
    """
    posts = []
    for blog in blogs:
        title = (blog.get("title") or "").strip()
        if not title:
            continue

        # Prefer the explicit URL; otherwise build a site-relative path from the slug.
        url = blog.get("url") or f"/blog/{blog.get('slug', '')}"
        link = f"{HF_BASE_URL}{url}" if url.startswith("/") else url

        date = None
        published_at = blog.get("publishedAt")
        if published_at:
            try:
                # A trailing "Z" is rewritten as an explicit UTC offset before parsing.
                date = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
            except ValueError:
                logger.warning(f"Could not parse date for: {title}")
            else:
                # Timestamps without an offset are treated as UTC.
                if date.tzinfo is None:
                    date = date.replace(tzinfo=pytz.UTC)
        if date is None:
            # No usable timestamp: derive a fallback date from the link.
            date = stable_fallback_date(link)

        tags = blog.get("tags") or []
        description = f"{title} ({', '.join(tags)})" if tags else title
        category = tags[0] if tags else "Blog"

        posts.append(
            {
                "title": title,
                "link": link,
                "date": date,
                "description": description,
                "category": category,
            }
        )
    return posts


def fetch_all_posts(tag: str) -> list[dict]:
    """Fetch every post for a tag by walking the paginated API results.

    Pages are requested from index 0 upward and de-duplicated by link. The
    walk stops when a page contains no blogs, when the number of collected
    posts reaches the API's ``numTotalItems``, or when a page contributes no
    new posts (which would otherwise loop forever if the API keeps repeating
    a page while the reported total is never reached).

    Args:
        tag: Blog tag to query.

    Returns:
        Unique post dicts in the order they were first seen.

    Raises:
        requests.HTTPError: Propagated from ``fetch_posts_page`` on an error
            status.
    """
    all_posts: list[dict] = []
    seen_links: set[str] = set()
    page = 0

    while True:
        logger.info(f"Fetching page {page} for tag={tag!r}")
        api_data = fetch_posts_page(tag, page)
        # `or []` also covers an explicit null under "allBlogs".
        blogs = api_data.get("allBlogs") or []
        if not blogs:
            logger.info(f"No posts returned on page {page}, stopping")
            break

        page_posts = parse_api_posts(blogs)
        new_count = 0
        for post in page_posts:
            link = post["link"]
            if link not in seen_links:
                seen_links.add(link)
                all_posts.append(post)
                new_count += 1

        total = api_data.get("numTotalItems")
        if total is None:
            # Missing (or null) total: treat what we have as complete, which
            # keeps the original stop-after-this-page behaviour for a missing key.
            total = len(all_posts)
        logger.info(f"Page {page}: {len(page_posts)} posts (total: {len(all_posts)}/{total})")
        if len(all_posts) >= total:
            break
        if new_count == 0:
            # Skipped or duplicate entries can keep the count below the reported
            # total forever; bail out once a page makes no progress.
            logger.warning(f"Page {page} added no new posts for tag={tag!r}, stopping")
            break
        page += 1

    return all_posts


def fetch_latest_posts(tag: str) -> list[dict]:
    """Return the parsed posts from page 0 only, for incremental updates.

    Args:
        tag: Blog tag to query.

    Returns:
        Post dicts parsed from the first page's ``allBlogs`` list.
    """
    first_page = fetch_posts_page(tag, page=0)
    raw_blogs = first_page.get("allBlogs", [])
    posts = parse_api_posts(raw_blogs)
    logger.info(f"Fetched {len(posts)} latest posts for tag={tag!r}")
    return posts


def generate_rss_feed(
    posts: list[dict],
    *,
    feed_name: str,
    blog_url: str,
    feed_title: str,
    feed_description: str,
) -> FeedGenerator:
    """Build a FeedGenerator populated with one entry per post.

    Args:
        posts: Post dicts carrying ``title``, ``description``, ``link``,
            ``category`` and, optionally, ``date``.
        feed_name: Feed identifier forwarded to ``setup_feed_links``.
        blog_url: Source blog URL forwarded to ``setup_feed_links``.
        feed_title: Channel title.
        feed_description: Channel description.

    Returns:
        The populated ``FeedGenerator``.
    """
    fg = FeedGenerator()
    fg.title(feed_title)
    fg.description(feed_description)
    fg.language("en")
    fg.author({"name": "Hugging Face"})
    setup_feed_links(fg, blog_url=blog_url, feed_name=feed_name)

    ordered_posts = sort_posts_for_feed(posts, date_field="date")
    for item in ordered_posts:
        entry = fg.add_entry()
        entry.title(item["title"])
        entry.description(item["description"])
        entry.link(href=item["link"])
        # The post link doubles as the entry's unique id.
        entry.id(item["link"])
        entry.category(term=item["category"])
        published = item.get("date")
        if published:
            entry.published(published)

    logger.info(f"Generated RSS feed with {len(posts)} entries")
    return fg


def run_tag_feed(
    *,
    tag: str,
    feed_name: str,
    blog_url: str,
    feed_title: str,
    feed_description: str,
    full_reset: bool = False,
) -> bool:
    """Refresh the cache and RSS file for one Hugging Face blog tag.

    A full fetch runs when ``full_reset`` is set or the cache holds no
    entries; otherwise only the newest page is fetched and merged with the
    cached entries.

    Args:
        tag: Blog tag to query.
        feed_name: Name used for the cache and the saved feed.
        blog_url: Source blog URL recorded in the feed.
        feed_title: Channel title.
        feed_description: Channel description.
        full_reset: Force a full fetch even when cached entries exist.

    Returns:
        True when a feed was written, False when no posts were available.
    """
    cached_entries = deserialize_entries(load_cache(feed_name).get("entries", []))

    if not full_reset and cached_entries:
        logger.info(f"Running incremental update for tag={tag!r}")
        posts = merge_entries(fetch_latest_posts(tag), cached_entries)
    else:
        mode = "full reset" if full_reset else "no cache exists"
        logger.info(f"Running full fetch ({mode}) for tag={tag!r}")
        fetched = fetch_all_posts(tag)
        posts = sort_posts_for_feed(fetched, date_field="date")

    if not posts:
        logger.warning(f"No posts found for tag={tag!r}. Check the Hugging Face API response.")
        return False

    # Persist the cache first, then render and write the feed from the same posts.
    save_cache(feed_name, posts)
    rendered = generate_rss_feed(
        posts,
        feed_name=feed_name,
        blog_url=blog_url,
        feed_title=feed_title,
        feed_description=feed_description,
    )
    save_rss_feed(rendered, feed_name)
    logger.info("Done!")
    return True
30 changes: 30 additions & 0 deletions feed_generators/huggingface_ethics_blog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Generate RSS feed for Hugging Face Blog posts tagged ethics."""

import argparse

from huggingface_blog_common import run_tag_feed
from utils import setup_logging

logger = setup_logging()

# Feed identifier passed to run_tag_feed as feed_name.
FEED_NAME = "huggingface_ethics"
# Tag listing page on the Hugging Face site; passed to run_tag_feed as blog_url.
BLOG_URL = "https://huggingface.co/blog?tag=ethics"
# Tag value passed to run_tag_feed to select which posts are fetched.
TAG = "ethics"


def main(full_reset: bool = False) -> bool:
    """Generate the ethics tag feed.

    Args:
        full_reset: Forwarded to ``run_tag_feed`` to force a full fetch.

    Returns:
        Whatever ``run_tag_feed`` returns for this feed.
    """
    feed_options = {
        "tag": TAG,
        "feed_name": FEED_NAME,
        "blog_url": BLOG_URL,
        "feed_title": "Hugging Face Blog (Ethics)",
        "feed_description": "Ethics posts from the Hugging Face blog",
        "full_reset": full_reset,
    }
    return run_tag_feed(**feed_options)


if __name__ == "__main__":
    # Command-line entry point: --full is passed to main() as full_reset.
    cli = argparse.ArgumentParser(description="Generate Hugging Face ethics blog RSS feed")
    cli.add_argument("--full", action="store_true", help="Force full reset (fetch all tagged posts)")
    options = cli.parse_args()
    main(full_reset=options.full)
30 changes: 30 additions & 0 deletions feed_generators/huggingface_research_blog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Generate RSS feed for Hugging Face Blog posts tagged research."""

import argparse

from huggingface_blog_common import run_tag_feed
from utils import setup_logging

logger = setup_logging()

# Feed identifier passed to run_tag_feed as feed_name.
FEED_NAME = "huggingface_research"
# Tag listing page on the Hugging Face site; passed to run_tag_feed as blog_url.
BLOG_URL = "https://huggingface.co/blog?tag=research"
# Tag value passed to run_tag_feed to select which posts are fetched.
TAG = "research"


def main(full_reset: bool = False) -> bool:
    """Generate the research tag feed.

    Args:
        full_reset: Forwarded to ``run_tag_feed`` to force a full fetch.

    Returns:
        Whatever ``run_tag_feed`` returns for this feed.
    """
    feed_options = {
        "tag": TAG,
        "feed_name": FEED_NAME,
        "blog_url": BLOG_URL,
        "feed_title": "Hugging Face Blog (Research)",
        "feed_description": "Research posts from the Hugging Face blog",
        "full_reset": full_reset,
    }
    return run_tag_feed(**feed_options)


if __name__ == "__main__":
    # Command-line entry point: --full is passed to main() as full_reset.
    cli = argparse.ArgumentParser(description="Generate Hugging Face research blog RSS feed")
    cli.add_argument("--full", action="store_true", help="Force full reset (fetch all tagged posts)")
    options = cli.parse_args()
    main(full_reset=options.full)
Loading
Loading