-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathapp.py
92 lines (75 loc) · 3.27 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit.components.v1 as components
from datetime import datetime
import streamlit as st
from bs4 import BeautifulSoup
import requests
import io
## ............................................... ##
# Page-wide configuration; Streamlit expects this before other UI calls.
st.set_page_config(
    page_title='HTML-Content-Extractor',
    layout='wide',
    page_icon=':spiral_note_pad:',
)

## ............................................... ##
# Sidebar: step-by-step usage instructions, rendered in order.
usage_steps = (
    "1. Enter the URL you want to extract HTML content from.",
    "2. Click the 'Fetch and Process HTML' button.",
    "3. The HTML content will be displayed.",
    "4. Click the 'Download HTML' button to save the HTML file.",
)
st.sidebar.subheader("How to Use?")
for usage_step in usage_steps:
    st.sidebar.info(usage_step)

## ............................................... ##
# Sidebar footer with the current year.
current_year = datetime.now().year
st.sidebar.markdown(f"© {current_year} Bayhaqy")

## ............................................... ##
# Main area: app title and a one-line description.
with st.container():
    st.title("HTML Content Extractor")
    st.write("Tools to extract HTML content from the Google web cache.")

## ............................................... ##
# Text box where the user supplies the page URL to look up.
url = st.text_input("Enter the URL:")
## ............................................... ##
# Function to remove script tag and text before <html>
def remove_script_and_before_html(html_content):
    """Strip the ``window.main();`` script and anything preceding ``<html``.

    The markup is parsed with BeautifulSoup's ``html.parser``, the first
    ``<script>`` element whose string is exactly ``window.main();`` is
    removed (if present), and the document is serialized back to text.

    Args:
        html_content: Raw HTML markup to clean.

    Returns:
        The serialized markup starting at the first ``<html`` occurrence,
        or the message ``"No <html> tag found in the HTML content."`` when
        the serialized markup contains no ``<html`` substring.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # `string=` is the supported name for the filter formerly spelled
    # `text=`, which BeautifulSoup has deprecated.
    script_tag = soup.find("script", string="window.main();")
    if script_tag:
        script_tag.extract()

    cleaned_html = str(soup)

    # Plain substring search, so it also matches `<html lang="...">`.
    html_start = cleaned_html.find("<html")
    if html_start == -1:
        return "No <html> tag found in the HTML content."

    # Drop everything that precedes the <html> tag.
    return cleaned_html[html_start:]
## ............................................... ##
# Button to fetch and process the content
if st.button("Fetch and Process HTML"):
    # Ignore surrounding whitespace so a blank-looking input is rejected.
    target_url = url.strip()
    if not target_url:
        st.warning("Please enter a URL.")
    else:
        base_url = 'https://webcache.googleusercontent.com/search?q=cache:'
        full_url = base_url + target_url + '&strip=0&vwsrc=0'
        st.write(full_url)

        try:
            # A timeout keeps the app from hanging on an unresponsive host.
            response = requests.get(full_url, timeout=30)
        except requests.RequestException as exc:
            # Connection errors, timeouts, malformed URLs, etc. are reported
            # in the UI instead of crashing the script with a traceback.
            st.error(f"Failed to retrieve the web page: {exc}")
        else:
            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                modified_html_content = remove_script_and_before_html(response.text)

                # Render the cleaned HTML inside a collapsible section.
                with st.expander("Show Medium Content"):
                    components.html(
                        modified_html_content,
                        width=1280,
                        height=700,
                        scrolling=True,
                    )

                # Button to download the HTML file
                st.download_button(
                    label="Download HTML",
                    data=modified_html_content,
                    key="download_html_button",
                    file_name="web_cache.html",
                    mime="text/html",
                )
            else:
                st.error(f"Failed to retrieve the web page. Status code: {response.status_code}")