-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpracto_integration.py
85 lines (71 loc) · 4.23 KB
/
practo_integration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import requests
from bs4 import BeautifulSoup
import json
def scrape_doctors_data_with_urls(url):
try:
# Send a GET request to the URL
response = requests.get(url)
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
# Parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')
# Find all main divs with the specified class
main_divs = soup.find_all('div', class_="u-border-general--bottom")
# List to store extracted data
doctors_data = []
for div in main_divs:
# Extract data using attribute selectors
doctor = {
"doctor_name": div.find(class_="doctor-name").text.strip() if div.find(class_="doctor-name") else None,
"experience": div.find(attrs={"data-qa-id": "doctor_experience"}).text.strip() if div.find(attrs={"data-qa-id": "doctor_experience"}) else None,
"locality": div.find(attrs={"data-qa-id": "practice_locality"}).text.strip() if div.find(attrs={"data-qa-id": "practice_locality"}) else None,
"city": div.find(attrs={"data-qa-id": "practice_city"}).text.strip() if div.find(attrs={"data-qa-id": "practice_city"}) else None,
"clinic_name": div.find(attrs={"data-qa-id": "doctor_clinic_name"}).text.strip() if div.find(attrs={"data-qa-id": "doctor_clinic_name"}) else None,
"clinic_count": div.find(attrs={"data-qa-id": "doctor_clinic_count"}).text.strip() if div.find(attrs={"data-qa-id": "doctor_clinic_count"}) else None,
"consultation_fee": div.find(attrs={"data-qa-id": "consultation_fee"}).text.strip() if div.find(attrs={"data-qa-id": "consultation_fee"}) else None,
"recommendation": div.find(attrs={"data-qa-id": "doctor_recommendation"}).text.strip() if div.find(attrs={"data-qa-id": "doctor_recommendation"}) else None,
"total_feedback": div.find(attrs={"data-qa-id": "total_feedback"}).text.strip() if div.find(attrs={"data-qa-id": "total_feedback"}) else None,
"availability_text": div.find(attrs={"data-qa-id": "availability_text"}).text.strip() if div.find(attrs={"data-qa-id": "availability_text"}) else None
}
# Extract URLs from the script tag
script_tag = div.find('script', type="application/ld+json")
if script_tag:
try:
# Load the JSON content
script_json = json.loads(script_tag.string)
urls = []
# Collect 'url' fields
# if "url" in script_json:
# doctor['profile'] = script_json["url"]
# # urls.append(script_json["url"])
# Collect additional URLs from "photo" or other nested structures
if "photo" in script_json:
for photo in script_json["photo"]:
if "url" in photo:
urls.append(photo["url"])
doctor["urls"] = urls[:11]
except json.JSONDecodeError:
doctor["urls"] = []
doctors_data.append(doctor)
# Return the list of extracted data
return doctors_data
except requests.exceptions.RequestException as e:
print(f"Error while fetching the URL: {e}")
return None
# Example usage
# if __name__ == "__main__":
# url = '''https://www.practo.com/search/doctors?results_type=doctor&q=[{"word":"neurologist","autocompleted":true,"category":"subspeciality"}]&city=Bangalore&filters[doctor_review_count]=20,9999999&filters[years_of_experience]=15,9999999'''
# data = scrape_doctors_data_with_urls(url)
# print(data)
# if data:
# for idx, doctor in enumerate(data, 1):
# print(f"Doctor {idx}:")
# for key, value in doctor.items():
# if key == "urls":
# print(f" {key}:")
# for url in value:
# print(f" - {url}")
# else:
# print(f" {key}: {value}")
# print("\n")
# else:
# print("No data found.")