Commit 9363095

Validate exclusion regexes on backend (#2316)

1 parent 763c654 · commit 9363095

File tree

5 files changed: +80 -3 lines changed

backend/btrixcloud/crawlconfigs.py
backend/btrixcloud/crawls.py
backend/btrixcloud/utils.py
backend/test/test_crawlconfigs.py
backend/test/test_run_crawl.py

backend/btrixcloud/crawlconfigs.py (+15 -3)

@@ -43,7 +43,7 @@
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes

 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -189,7 +189,7 @@ async def get_profile_filename(

         return profile_filename

-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, too-many-branches
     async def add_crawl_config(
         self,
         config_in: CrawlConfigIn,
@@ -215,6 +215,12 @@ async def add_crawl_config(
             if not self.can_org_use_proxy(org, config_in.proxyId):
                 raise HTTPException(status_code=404, detail="proxy_not_found")

+        if config_in.config.exclude:
+            exclude = config_in.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -317,11 +323,17 @@ def check_attr_changed(
     async def update_crawl_config(
         self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
     ) -> dict[str, bool | str]:
-        # pylint: disable=too-many-locals
+        # pylint: disable=too-many-locals, too-many-branches, too-many-statements
        """Update name, scale, schedule, and/or tags for an existing crawl config"""

         orig_crawl_config = await self.get_crawl_config(cid, org.id)

+        if update.config and update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
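Both code paths apply the same pattern before calling `validate_regexes`: `exclude` may arrive as a single string or as a list of strings, so a lone pattern is first wrapped in a one-element list. A minimal standalone sketch of that normalization (the helper name `normalize_exclude` is hypothetical; the diff inlines this logic in both `add_crawl_config` and `update_crawl_config`):

```python
from typing import List, Union


def normalize_exclude(exclude: Union[str, List[str]]) -> List[str]:
    # Hypothetical helper: wrap a single pattern in a list so the
    # validator always receives List[str].
    if isinstance(exclude, str):
        return [exclude]
    return exclude


assert normalize_exclude("abc.*") == ["abc.*"]
assert normalize_exclude(["abc.*", "def"]) == ["abc.*", "def"]
```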

backend/btrixcloud/crawls.py (+4)

@@ -24,6 +24,7 @@
     date_to_str,
     parse_jsonl_error_messages,
     stream_dict_list_as_csv,
+    validate_regexes,
 )
 from .basecrawls import BaseCrawlOps
 from .crawlmanager import CrawlManager
@@ -517,6 +518,9 @@ async def add_or_remove_exclusion(
         """add new exclusion to config or remove exclusion from config
         for given crawl_id, update config on crawl"""

+        if add:
+            validate_regexes([regex])
+
         crawl = await self.get_crawl(crawl_id, org)

         if crawl.state not in RUNNING_AND_WAITING_STATES:
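Note the asymmetry: only additions are validated. A removal refers to a pattern already stored in the running crawl's config, so re-checking it would only block cleanup of a previously accepted pattern. A hedged sketch of how a client would hit this live-crawl endpoint (the base URL, ids, and token are placeholders; the real coverage is the test in backend/test/test_run_crawl.py below):

```python
import requests

# Placeholder values for illustration; the tests below use fixtures
# for API_PREFIX, the org id, the crawl id, and auth headers.
API_PREFIX = "https://btrix.example.com/api"
ORG_ID = "<org-uuid>"
CRAWL_ID = "<crawl-id>"
HEADERS = {"Authorization": "Bearer <token>"}

# Adding an exclusion now validates the regex first:
# "[" is an unterminated character set and fails re.compile.
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawls/{CRAWL_ID}/exclusions?regex=[",
    headers=HEADERS,
)
print(r.status_code, r.json()["detail"])  # expected: 400 invalid_regex
```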

backend/btrixcloud/utils.py (+10)

@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
         return default_origin

     return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail="invalid_regex")
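The helper leans entirely on `re.compile` for validation and maps any `re.error` to a 400 response. A runnable sketch of the same behavior, reproducing the new helper with a small driver (the only assumption beyond the diff is that `fastapi` is installed, for `HTTPException`):

```python
import re

from fastapi import HTTPException


def validate_regexes(regexes):
    """Validate regular expressions, raise HTTPException if invalid"""
    for regex in regexes:
        try:
            re.compile(regex)
        except re.error:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=400, detail="invalid_regex")


validate_regexes(["abc.*"])  # all patterns compile: no exception
try:
    validate_regexes(["abc.*", "["])  # "[" fails to compile
except HTTPException as exc:
    print(exc.status_code, exc.detail)  # 400 invalid_regex
```

One invalid pattern anywhere in the list rejects the whole request, which matches the tests below that mix a valid `"abc.*"` with an invalid `"["`.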

backend/test/test_crawlconfigs.py (+42)

@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
     assert r.status_code == 422


+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -487,3 +507,25 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
     for crawler_channel in crawler_channels:
         assert crawler_channel["id"]
         assert crawler_channel["image"]
+
+
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["exclude"] = "["
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"

backend/test/test_run_crawl.py (+9)

@@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id):
     assert r.json()["success"] == True


+def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_remove_exclusion(admin_auth_headers, default_org_id):
     r = requests.delete(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",
