kuaishou_video_download.py
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from execute_database import insert_into_database
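# `insert_into_database` comes from a local helper module that is not shown
# in this file. A minimal sketch of what it presumably does, assuming it
# inserts one row into a SQLite table (the signature is inferred from the
# call site in the __main__ block below):
#
#     import sqlite3
#
#     def insert_into_database(database, table_name, fields, values):
#         with sqlite3.connect(database) as conn:
#             placeholders = ', '.join('?' for _ in values)
#             conn.execute(
#                 f"INSERT INTO {table_name} ({', '.join(fields)}) "
#                 f"VALUES ({placeholders})",
#                 values,
#             )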


def extract_videos_info(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    videos_data = []
    # Find every video card div
    videos = soup.find_all('div', class_='video-card video-item vertical')
    for video in videos:
        # Extract the title from the poster image's 'alt' attribute
        title = video.find('div', class_='poster').find('img').get('alt', 'No title')
        # Extract the like count from the 'info-text' span
        like_span = video.find('span', class_='info-text')
        likes_text = like_span.text if like_span else ''
        # Assuming the text always has the form "<number>喜欢" ("<number> likes"),
        # keep only the digits; fall back to 0 so int() cannot fail on empty text
        digits = ''.join(filter(str.isdigit, likes_text))
        likes = int(digits) if digits else 0
        video_info = {
            'title': title,
            'user_like': likes,
        }
        videos_data.append(video_info)
    return videos_data
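
# Note: like counts rendered as "1.2万" (i.e. 12,000) would be mangled by the
# digit filter above ("1.2万" -> 12). If that format shows up, a hypothetical
# helper along these lines could replace the filter (not part of the original
# module):
#
#     import re
#
#     def parse_like_count(text):
#         match = re.search(r'(\d+(?:\.\d+)?)(万)?', text)
#         if not match:
#             return 0
#         value = float(match.group(1))
#         return int(value * 10000) if match.group(2) else int(value)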


def scrape_videos_and_store_info(url):
    options = Options()
    options.add_argument('--disable-gpu')
    options.add_argument("window-size=1024,768")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver_path = 'D:\\Program Files\\chromedriver-win64\\chromedriver.exe'
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=options)
    videos_info = []
    try:
        driver.get(url)
        time.sleep(5)  # Give the page enough time to load
        # Scroll the page until no new videos load
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # Wait for new videos to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Find divs with the video-card class
        videos = soup.find_all('div', class_='video-card video-item vertical')
        for video in videos:
            # The title sits in the 'alt' attribute of the 'img' inside the 'poster' div
            title = video.find('div', class_='poster').find('img').get('alt', 'No title')
            # Find the 'span' containing the like count
            user_like_element = video.find('div', class_='video-info no-jump-avatar').find(
                'span', class_='info-text')
            user_like = user_like_element.text if user_like_element else '0 likes'
            # The link is not directly available here; update this once its
            # location in the HTML structure is known
            href = 'No link'
            videos_info.append({
                'title': title,
                'user_like': user_like,
                'href': href
            })
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()  # Ensure the driver quits after scraping
    return videos_info


if __name__ == "__main__":
    table_name = 'videos'
    database = 'video_info.db'
    url = 'https://www.kuaishou.com/profile/3xa8g6xcb3h8x2c'
    html_file_path = r'C:\Users\13636\Downloads\AI北极熊(收徒)-快手.html'
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    # `html_content` now holds the saved HTML source of the profile page
    videos_info = extract_videos_info(html_content)
    # Print the list of video data
    print(videos_info)
    # Store each video's fields in the database
    fields = ['title', 'user_like']
    for video in videos_info:
        values = [video['title'], video['user_like']]
        insert_into_database(database, table_name, fields, values)
    # videos_info = scrape_videos_and_store_info(url)
    # store_video_info_to_db(videos_info, database, 'kuaishou')