15
15
from selectolax .parser import HTMLParser
16
16
17
17
from src .aws import is_aws_configured
18
- from src .models . utils import from_jsonl , to_jsonl
18
+ from src .local_store import read_meetings , write_meetings
19
19
20
20
from .models .meeting import Meeting
21
21
22
22
# Granicus ViewPublisher listing page (view_id=4) on the tulsa-ok site.
# NOTE(review): presumably the page scraped by get_tgov_meetings — confirm at the call site.
BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
23
23
# S3 bucket name that the S3-based registry read/write code passed as Bucket=.
# NOTE(review): that S3 code is shown as removed in this change; confirm whether
# this constant is still referenced elsewhere before relying on or deleting it.
TGOV_BUCKET_NAME = "tgov-meetings"
24
24
# Object key of the JSONL registry that the S3-based code passed as Key=.
# NOTE(review): the S3 code using it is shown as removed in this change; confirm
# whether the local store or any other module still reads this path.
MEETINGS_REGISTRY_PATH = "data/meetings.jsonl"
25
25
26
-
27
26
async def fetch_page (url : str , session : aiohttp .ClientSession ) -> str :
28
27
"""
29
28
Fetch the HTML content of a page.
@@ -40,11 +39,9 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
40
39
raise Exception (f"Failed to fetch { url } , status code: { response .status } " )
41
40
return await response .text ()
42
41
43
-
44
42
def clean_date(date: str) -> str:
    """
    Normalize whitespace in a date string.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space, and leading/trailing whitespace is removed.

    Args:
        date: Raw date text.

    Returns:
        The text with single-space separators and no surrounding whitespace.
    """
    # split() with no separator breaks on whitespace runs and drops the empty
    # leading/trailing pieces, so re-joining with one space both collapses
    # and trims in a single pass.
    pieces = date.split()
    return " ".join(pieces)
46
44
47
-
48
45
async def parse_meetings (html : str ) -> List [Dict [str , str ]]:
49
46
"""
50
47
Parse the meeting data from the HTML content.
@@ -67,21 +64,18 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
67
64
# Process each table
68
65
for table in tables :
69
66
for row in table .css ("tr.listingRow" ):
70
- cells = row .css ("td" )
71
67
name_cells = row .css ('td.listItem[headers^="Name"]' )
72
68
meeting_name = name_cells [0 ].text ().strip () if name_cells else "Unknown"
73
69
74
70
date_cells = row .css ('td.listItem[headers^="Date"]' )
75
71
raw_date = clean_date (date_cells [0 ].text ().strip ()) if date_cells else "Unknown"
76
72
meeting_date = raw_date .split ("-" )[0 ].strip () if "-" in raw_date else raw_date
77
73
78
-
79
74
duration_cells = row .css ('td.listItem[headers^="Duration"]' )
80
75
duration_str = duration_cells [0 ].text ().strip () if duration_cells else "Unknown"
81
76
minutes = duration_to_minutes (duration_str )
82
77
meeting_duration = f"{ minutes // 60 } :{ minutes % 60 :02d} " if minutes is not None else "Unknown"
83
78
84
-
85
79
meeting_data = {
86
80
"meeting" : meeting_name ,
87
81
"date" : meeting_date ,
@@ -131,7 +125,6 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
131
125
132
126
return meetings
133
127
134
-
135
128
async def get_tgov_meetings () -> Sequence [Meeting ]:
136
129
"""
137
130
Fetch and parse meeting data from the Government Access Television website.
@@ -147,7 +140,6 @@ async def get_tgov_meetings() -> Sequence[Meeting]:
147
140
meetings = [Meeting (** meeting_dict ) for meeting_dict in meeting_dicts ]
148
141
return meetings
149
142
150
-
151
143
def duration_to_minutes (duration ):
152
144
if not duration or pd .isna (duration ):
153
145
return None
@@ -172,43 +164,25 @@ def duration_to_minutes(duration):
172
164
except :
173
165
return None
174
166
175
-
176
167
def get_registry_meetings() -> Sequence[Meeting]:
    """
    Load the meetings registry from the configured backend.

    When is_aws_configured() is truthy, the registry is read with
    Meeting.scan() and materialized into a list. Otherwise it is read
    through read_meetings() from src.local_store.

    Returns:
        The registry contents: a list of the items yielded by
        Meeting.scan(), or whatever read_meetings() returns.
    """
    if is_aws_configured():
        print('Getting registry from DynamoDB.')
        # Consume the scan eagerly so callers receive a concrete list.
        return list(Meeting.scan())

    print('Getting registry from local store')
    return read_meetings()
192
174
193
175
def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
    """
    Persist the meetings registry to the configured backend.

    When is_aws_configured() is truthy, each meeting with a truthy clip_id
    is saved via meeting.save(); meetings without one are skipped with a
    printed notice. Otherwise the whole sequence is handed to
    write_meetings() from src.local_store.

    Args:
        meetings: The meetings to persist.

    Returns:
        The same `meetings` sequence that was passed in, unchanged
        (including any entries that were skipped).
    """
    if not is_aws_configured():
        print('Writing registry to local store')
        write_meetings(meetings)
        return meetings

    print('Writing registry to DynamoDB.')
    # NOTE(review): the context manager's value is never bound and each record
    # is written with meeting.save(), so the writes may not actually be
    # batched. If Meeting is a PynamoDB model, the documented form is
    # `with Meeting.batch_write() as batch: batch.save(meeting)` — confirm
    # that Meeting really exposes batch_writer().
    with Meeting.batch_writer():
        for meeting in meetings:
            if not meeting.clip_id:
                print(f'Skipping meeting with missing clip_id: {meeting}')
                continue
            meeting.save()

    return meetings
0 commit comments