Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
3fcb775
create custom site scrapers using LLM-based CSS extraction approach
natea Mar 4, 2025
a4a5718
allow user to create custom scrapers using an
natea Mar 4, 2025
fb928f9
restore the scraper to its previously working glory (still need to r…
natea Mar 4, 2025
f20281b
the scraper is now getting all the lilypad images, but still missing …
natea Mar 4, 2025
9f2455a
image parsing is working on sites like lilypad that use data-src inst…
natea Mar 4, 2025
76733ea
scraper can now properly parse berklee event date/time strings
natea Mar 4, 2025
78b19ad
Added privacy policy and terms of service pages, and simple logo
natea Mar 4, 2025
f4f7409
add the privacy and terms of service links to the signup page
natea Mar 4, 2025
369adde
make a styled page for password reset done
natea Mar 4, 2025
0cd5ac4
replace the import events screen with the site scrapers functionality
natea Mar 4, 2025
95a5f40
Fix migration error by increasing Spotify field lengths to 255 charac…
natea Mar 4, 2025
8372e01
Fix migration error by modifying migration 0008 to use 255 chars for …
natea Mar 4, 2025
0c0a838
Fix logo rendering issue by using simpler WhiteNoise storage
natea Mar 4, 2025
f84a38d
update to use the latest crawl4ai that has the 'generate_schema' in …
natea Mar 4, 2025
c863ab6
Fix logo display by enhancing WhiteNoise configuration for static files
natea Mar 4, 2025
4f72558
Fix production deployment: add required apps to INSTALLED_APPS
natea Mar 4, 2025
2e699e8
Fix production deployment: update crawl4ai version to fix JsonCssExtr…
natea Mar 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,15 @@ env/

# Render
.render/

# Sample files
sample.html
website_html_content.html
luma_boston_sample.html
db.sqlite3.backup-2025-03-03
tockify_debug_logs.txt
tockify_full_html.txt
tockify_html_sample.txt
events/scrapers/javascript_to_wait.js
debug_tockify.py
events/scrapers/revert_site_scraper.py
Empty file added core/tests/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions core/tests/test_logo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import pytest
from django.urls import reverse
from django.conf import settings

@pytest.mark.django_db
class TestLogo:
    """Smoke-check that the site header renders the logo image."""

    def test_logo_in_header(self, client):
        """The logo <img> tag (src and alt text) appears in a rendered page."""
        page = client.get(reverse('core:privacy'))
        html = page.content.decode('utf-8')

        expected_fragments = (
            '<img src="/static/images/logo.png"',
            'alt="SocialCal Logo"',
        )
        for fragment in expected_fragments:
            assert fragment in html
16 changes: 16 additions & 0 deletions core/tests/test_privacy_terms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest
from django.urls import reverse

@pytest.mark.django_db
class TestPrivacyTermsViews:
    """Route-level smoke tests for the static legal pages."""

    def test_privacy_view(self, client):
        """Privacy policy page responds 200 and contains its heading."""
        resp = client.get(reverse('core:privacy'))
        assert resp.status_code == 200
        assert 'Privacy Policy' in str(resp.content)

    def test_terms_of_service_view(self, client):
        """Terms of service page responds 200 and contains its heading."""
        resp = client.get(reverse('core:terms_of_service'))
        assert resp.status_code == 200
        assert 'Terms of Service' in str(resp.content)
2 changes: 2 additions & 0 deletions core/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@
path('about/', views.about, name='about'),
path('contact/', views.contact, name='contact'),
path('search/', views.search, name='search'),
path('privacy/', views.privacy, name='privacy'),
path('terms-of-service/', views.terms_of_service, name='terms_of_service'),
]
14 changes: 13 additions & 1 deletion core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,16 @@ def search(request):
query = request.GET.get('q', '')
# Add search logic here
context = {'query': query, 'results': []}
return render(request, 'core/search.html', context)
return render(request, 'core/search.html', context)

def privacy(request):
    """Render the static privacy policy page."""
    template = 'core/privacy.html'
    return render(request, template)

def terms_of_service(request):
    """Render the static terms of service page."""
    template = 'core/terms_of_service.html'
    return render(request, template)
76 changes: 75 additions & 1 deletion events/admin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from django.contrib import admin
from django.contrib.auth import get_user_model
from .models import Event
from .models import Event, SiteScraper

class UserFilter(admin.SimpleListFilter):
title = 'User'
Expand Down Expand Up @@ -53,3 +53,77 @@ def make_public(self, request, queryset):
    def make_private(self, request, queryset):
        """Admin bulk action: clear ``is_public`` on every selected event."""
        queryset.update(is_public=False)
    # Label shown in the admin "Action" dropdown.
    make_private.short_description = "Make selected events private"

def delete_queryset(self, request, queryset):
"""
Override the delete_queryset method to delete related records before deleting the events.
This is needed because there are foreign key relationships with the Event model.
"""
# Get the IDs of the events to delete
event_ids = list(queryset.values_list('id', flat=True))

# Delete related records in events_event_labels
from django.db import connection
with connection.cursor() as cursor:
# Use %s as Django will convert it to the appropriate placeholder for the database
placeholders = ','.join(['%s' for _ in event_ids])

# Delete related records in events_event_labels
cursor.execute(f'DELETE FROM events_event_labels WHERE event_id IN ({placeholders})', event_ids)

# Delete related records in events_eventresponse
cursor.execute(f'DELETE FROM events_eventresponse WHERE event_id IN ({placeholders})', event_ids)

# Delete related records in events_starredevent
cursor.execute(f'DELETE FROM events_starredevent WHERE event_id IN ({placeholders})', event_ids)

# Now delete the events
super().delete_queryset(request, queryset)

class SiteScraperUserFilter(admin.SimpleListFilter):
    """Sidebar filter limiting scrapers to a single owner.

    Only users who actually own at least one SiteScraper appear as choices.
    """

    title = 'User'
    parameter_name = 'user'

    def lookups(self, request, model_admin):
        """Return (id, username) choices for every user owning a scraper."""
        owner_ids = SiteScraper.objects.values_list('user_id', flat=True)
        owners = get_user_model().objects.filter(id__in=owner_ids).order_by('username')
        return [(owner.id, owner.username) for owner in owners]

    def queryset(self, request, queryset):
        """Restrict the queryset to the chosen owner, if any."""
        selected = self.value()
        if not selected:
            return queryset
        return queryset.filter(user_id=selected)

@admin.register(SiteScraper)
class SiteScraperAdmin(admin.ModelAdmin):
    """Admin configuration for SiteScraper records: list columns, filters,
    search, and bulk activate/deactivate actions."""

    list_display = ['name', 'url', 'user', 'is_active', 'last_tested', 'created_at']
    list_filter = [SiteScraperUserFilter, 'is_active', 'created_at', 'last_tested']
    search_fields = ['name', 'url', 'description']
    date_hierarchy = 'created_at'
    ordering = ['-created_at']
    actions = ['make_active', 'make_inactive']

    def make_active(self, request, queryset):
        """Bulk action: flag the selected scrapers as active."""
        queryset.update(is_active=True)
    make_active.short_description = "Make selected scrapers active"

    def make_inactive(self, request, queryset):
        """Bulk action: flag the selected scrapers as inactive."""
        queryset.update(is_active=False)
    make_inactive.short_description = "Make selected scrapers inactive"
55 changes: 53 additions & 2 deletions events/forms.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from django import forms
from .models import Event
from .models import Event, SiteScraper
import pytz
from django.conf import settings
from django.utils import timezone
from .utils.spotify import SpotifyAPI
import json
from django.core.exceptions import ValidationError

class EventForm(forms.ModelForm):
timezone = forms.ChoiceField(
Expand Down Expand Up @@ -132,4 +134,53 @@ def clean(self):
self.add_error('end_time', 'End time must be after start time')
self.add_error('start_time', 'Start time must be before end time')

return cleaned_data
return cleaned_data

class SiteScraperForm(forms.ModelForm):
    """Form for creating and editing site scrapers.

    Exposes the model's JSON ``css_schema`` through a free-text
    ``css_schema_json`` textarea so users can paste or edit the selector
    schema directly; a blank field means the schema will be generated later.
    """

    css_schema_json = forms.CharField(
        widget=forms.Textarea(attrs={'rows': 10, 'class': 'form-control'}),
        required=False,
        help_text="JSON schema for CSS selectors. Will be auto-generated if left blank."
    )

    class Meta:
        model = SiteScraper
        fields = ['name', 'url', 'description', 'is_active']
        widgets = {
            'name': forms.TextInput(attrs={'class': 'form-control'}),
            'url': forms.URLInput(attrs={'class': 'form-control'}),
            'description': forms.Textarea(attrs={'rows': 3, 'class': 'form-control'}),
            'is_active': forms.CheckboxInput(attrs={'class': 'form-check-input'}),
        }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # When editing an existing scraper, pre-fill the textarea with the
        # stored schema, pretty-printed for readability.
        if self.instance.pk and self.instance.css_schema:
            self.fields['css_schema_json'].initial = json.dumps(self.instance.css_schema, indent=2)

    def clean_css_schema_json(self):
        """Parse and validate the CSS schema textarea.

        Returns the parsed dict (an empty dict when the field is blank).
        Raises ValidationError carrying the decoder's message so the user
        can locate the exact syntax error instead of a generic notice.
        """
        css_schema_json = self.cleaned_data.get('css_schema_json')
        if css_schema_json:
            try:
                return json.loads(css_schema_json)
            except json.JSONDecodeError as exc:
                # Include line/column detail from the decoder and chain the
                # cause so the original traceback survives for debugging.
                raise ValidationError(f"Invalid JSON format: {exc}") from exc
        return {}

    def save(self, commit=True):
        """Save the form and update the css_schema field.

        A blank textarea (parsed to ``{}``) deliberately leaves the stored
        schema untouched so it can be auto-generated elsewhere.
        """
        scraper = super().save(commit=False)

        # Copy the parsed JSON (dict returned by clean_css_schema_json)
        # onto the model field.
        css_schema_json = self.cleaned_data.get('css_schema_json')
        if css_schema_json:
            scraper.css_schema = css_schema_json

        if commit:
            scraper.save()

        return scraper
1 change: 1 addition & 0 deletions events/management/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Django management commands
1 change: 1 addition & 0 deletions events/management/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Django management commands for events app
23 changes: 23 additions & 0 deletions events/management/commands/test_website_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import asyncio
from django.core.management.base import BaseCommand

class Command(BaseCommand):
    """Management command: run the website scraper once against a given URL
    and report whether any events were extracted."""

    help = 'Test the website scraper with the updated date parser'

    def add_arguments(self, parser):
        parser.add_argument('url', type=str, help='URL to test the scraper with')

    def handle(self, *args, **options):
        # Imported lazily so Django app loading completes before the
        # scrapers package (and its heavy dependencies) is touched.
        from events.scrapers.test_website import test_website_scraper

        url = options['url']
        self.stdout.write(f'Testing website scraper with URL: {url}')

        # asyncio.run() creates and tears down a fresh event loop per call;
        # asyncio.get_event_loop() outside a running loop is deprecated
        # since Python 3.10 and fails on 3.12+ when no loop exists.
        events = asyncio.run(test_website_scraper(url))

        # Display summary
        if events:
            self.stdout.write(self.style.SUCCESS(f'Successfully extracted {len(events)} events'))
        else:
            self.stdout.write(self.style.ERROR('Failed to extract events'))
65 changes: 65 additions & 0 deletions events/migrations/0008_alter_event_spotify_artist_id_and_more.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Generated by Django 4.2.9 on 2025-03-03 21:21

from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Widen the Spotify char/URL fields and add the SiteScraper model.

    Auto-generated by Django; do not edit the operations by hand — create a
    follow-up migration for any further schema change.
    """

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ('events', '0007_merge_20250127_0043'),
    ]

    operations = [
        # Spotify identifier/name fields widened to 255 chars (URLs to 500)
        # to match the updated Event model definitions.
        migrations.AlterField(
            model_name='event',
            name='spotify_artist_id',
            field=models.CharField(blank=True, max_length=255),
        ),
        migrations.AlterField(
            model_name='event',
            name='spotify_artist_name',
            field=models.CharField(blank=True, max_length=255),
        ),
        migrations.AlterField(
            model_name='event',
            name='spotify_external_url',
            field=models.URLField(blank=True, max_length=500),
        ),
        migrations.AlterField(
            model_name='event',
            name='spotify_preview_url',
            field=models.URLField(blank=True, max_length=500),
        ),
        migrations.AlterField(
            model_name='event',
            name='spotify_track_id',
            field=models.CharField(blank=True, max_length=255),
        ),
        migrations.AlterField(
            model_name='event',
            name='spotify_track_name',
            field=models.CharField(blank=True, max_length=255),
        ),
        # New per-user scraper configuration table.
        migrations.CreateModel(
            name='SiteScraper',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=200)),
                ('url', models.URLField(max_length=500)),
                ('description', models.TextField(blank=True)),
                ('css_schema', models.JSONField(default=dict)),
                ('last_tested', models.DateTimeField(blank=True, null=True)),
                ('test_results', models.JSONField(blank=True, default=dict)),
                ('is_active', models.BooleanField(default=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='site_scrapers', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['name'],
            },
        ),
    ]
43 changes: 38 additions & 5 deletions events/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ class Event(models.Model):
image_url = models.URLField(max_length=500, blank=True)

# Spotify Integration
spotify_track_id = models.CharField(max_length=100, blank=True)
spotify_track_name = models.CharField(max_length=200, blank=True)
spotify_artist_id = models.CharField(max_length=100, blank=True)
spotify_artist_name = models.CharField(max_length=200, blank=True)
spotify_track_id = models.CharField(max_length=255, blank=True)
spotify_track_name = models.CharField(max_length=255, blank=True)
spotify_artist_id = models.CharField(max_length=255, blank=True)
spotify_artist_name = models.CharField(max_length=255, blank=True)
spotify_preview_url = models.URLField(max_length=500, blank=True)
spotify_external_url = models.URLField(max_length=500, blank=True)

Expand Down Expand Up @@ -65,4 +65,37 @@ def location(self):

def get_full_address(self):
"""Return the full address as a string."""
return self.location
return self.location

class SiteScraper(models.Model):
    """Model to store site scraper configurations with CSS extraction strategies."""

    # Owner of the scraper; deleting the user cascades to their scrapers.
    user = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.CASCADE,
        related_name='site_scrapers'
    )
    name = models.CharField(max_length=200)
    url = models.URLField(max_length=500)
    description = models.TextField(blank=True)

    # CSS Extraction Strategy: JSON document of selectors used to pull
    # event fields out of the target page.
    css_schema = models.JSONField(default=dict)

    # Last test results: timestamp and raw output of the most recent run.
    last_tested = models.DateTimeField(null=True, blank=True)
    test_results = models.JSONField(default=dict, blank=True)

    # Settings / bookkeeping
    is_active = models.BooleanField(default=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        app_label = 'events'
        ordering = ['name']

    def __str__(self):
        return self.name

    def get_absolute_url(self):
        # NOTE(review): relies on `reverse` being imported at module level —
        # the import is not visible in this chunk; confirm it exists.
        return reverse('events:scraper_detail', kwargs={'pk': self.pk})
Loading
Loading