-
Notifications
You must be signed in to change notification settings - Fork 113
/
Copy pathscrape.py
100 lines (76 loc) · 2.94 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import asyncio
import pprint
from typing import Optional

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
def remove_unwanted_tags(html_content, unwanted_tags=("script", "style")) -> str:
    """Remove every element whose tag name appears in *unwanted_tags*.

    Args:
        html_content: HTML markup to clean; it is parsed with BeautifulSoup's
            built-in ``"html.parser"``.
        unwanted_tags: Iterable of tag names to delete. Defaults to
            ``("script", "style")``. The default is an immutable tuple so it
            cannot be shared-and-mutated across calls; lists passed by
            callers keep working because the value is only iterated.

    Returns:
        The remaining document serialized back to a string.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    for tag in unwanted_tags:
        for element in soup.find_all(tag):
            # decompose() detaches the element (and everything inside it)
            # from the tree and destroys it.
            element.decompose()
    return str(soup)
def extract_tags(html_content, tags: list[str]):
    """Collect the text of every element matching one of *tags*.

    The markup is parsed with BeautifulSoup's ``"html.parser"``. Results are
    grouped by tag, following the order of *tags*; within one tag the
    elements appear in the order ``find_all`` yields them. For ``"a"``
    elements that carry a non-empty ``href``, the link is appended to the
    text as ``"text (href)"``.

    Returns:
        All collected pieces joined with a single space.
    """
    document = BeautifulSoup(html_content, "html.parser")
    pieces = []
    for tag_name in tags:
        for node in document.find_all(tag_name):
            text = node.get_text()
            # Only anchors contribute a link; a missing or empty href falls
            # back to the bare text.
            link = node.get("href") if tag_name == "a" else None
            pieces.append(f"{text} ({link})" if link else text)
    return " ".join(pieces)
def remove_unessesary_lines(content):
    """Collapse *content* into one string of unique, trimmed lines.

    The text is split on ``"\\n"``; each line is stripped of surrounding
    whitespace; blank lines are dropped; repeated lines are kept only at
    their first occurrence. The survivors are concatenated with no separator
    at all, so the result contains no newlines.
    """
    trimmed = (raw.strip() for raw in content.split("\n"))
    # dict keys keep insertion order, so fromkeys() de-duplicates while
    # preserving the position of each line's first appearance.
    unique_lines = dict.fromkeys(line for line in trimmed if line)
    return "".join(unique_lines)
async def ascrape_playwright(url, tags: Optional[list[str]] = None) -> str:
    """Scrape *url* with headless Chromium and return its condensed text.

    The rendered page source is passed through ``remove_unwanted_tags``,
    ``extract_tags`` and ``remove_unessesary_lines``, in that order.

    Args:
        url: Address handed to ``page.goto``.
        tags: Tag names whose text is extracted. ``None`` (the default)
            selects ``["h1", "h2", "h3", "span"]``.

    Returns:
        The cleaned text, or the string ``"Error: <message>"`` when opening
        the page, navigating, or processing the content raises an
        ``Exception``. Failures while launching or closing the browser are
        not caught and propagate to the caller.
    """
    if tags is None:
        # Build a fresh list per call instead of sharing one mutable default.
        tags = ["h1", "h2", "h3", "span"]

    print("Started scraping...")
    results = ""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            page_source = await page.content()
            results = remove_unessesary_lines(
                extract_tags(remove_unwanted_tags(page_source), tags)
            )
            print("Content scraped")
        except Exception as e:
            # Best-effort contract: report the failure as text, don't raise.
            results = f"Error: {e}"
        finally:
            # Also runs when a BaseException (e.g. task cancellation) escapes
            # the try body, so the browser is closed on every exit path.
            await browser.close()
    return results
# Manual smoke test: scrape one sample page and pretty-print the result.
if __name__ == "__main__":
    url = "https://www.patagonia.ca/shop/new-arrivals"

    # Pass the coroutine's return value straight to pprint. The previous
    # wrapper coroutine printed the text itself and returned None, so pprint
    # always emitted a stray "None" after the results.
    pprint.pprint(asyncio.run(ascrape_playwright(url)))