Commit

complete
versatile0118 committed Aug 24, 2023
0 parents commit ebdb00f
Showing 76 changed files with 11,342 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
task1/Get_urls.py
task2/.env
6 changes: 6 additions & 0 deletions task1/--Input-Output.txt
@@ -0,0 +1,6 @@
Input files
Object.txt
Subject.txt
Script
Get_urls_deploy.py
Output file
url.csv
14 changes: 14 additions & 0 deletions task1/--Problem.txt
@@ -0,0 +1,14 @@
Google search links
-------------
Input
We are looking to get a compiled list of links related to adverse media about a specific person. We will provide a list of words (Subject) that will be combined with another list of words (Object).

Deliverable
We aim to compile a collection of unique links resulting from every conceivable keyword combination sourced from the following search parameters:
Subject 1 with Object 1, 2, 3, 4, 5, …, n
Subject 2 with Object 1, 2, 3, 4, 5, …, n
Subject n with Object 1, 2, 3, 4, 5, …, n
• 10 pages from Google.co.uk
• 10 pages from Google.us
• 5 pages from Bing.co.uk
• 5 pages from Bing.us
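For example, with the Subject.txt shipped in this commit (3 names) and Object.txt (12 terms), that is 3 × 12 = 36 distinct queries, each of which is run against all four result sets listed above.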
9 changes: 9 additions & 0 deletions task1/--Solution.txt
@@ -0,0 +1,9 @@
To achieve this, you would need to perform the following steps:
1. Create a list of Subject keywords: Compile a list of words or phrases that describe the specific person you are interested in. These keywords will be used to search for relevant information.
2. Create a list of Object keywords: Prepare another list of words or phrases that are related to the adverse media you are looking for. These keywords will be combined with the Subject keywords to generate search queries.
3. Generate search queries: Combine each Subject keyword with every Object keyword to create search queries. For example, if you have three Subject keywords (Subject 1, Subject 2, Subject 3) and five Object keywords (Object 1, Object 2, Object 3, Object 4, Object 5), you will have a total of 15 search queries; a minimal sketch of this combination and de-duplication logic follows this list.
4. Perform the search: Use a web scraping tool or a search engine API to search for each query on the specified search engines (Google.co.uk, Google.us, Bing.co.uk, Bing.us). Retrieve the search results for each query.
5. Extract the links: From the search results, extract the URLs of the web pages. Remove any duplicate links to ensure a collection of unique links.
6. Compile the final list: Combine all the unique links obtained from the search results into a single compiled list.
7. Organize the list: Categorize the links based on the search engine and the search parameters used (e.g., Google.co.uk, Google.us, Bing.co.uk, Bing.us, Subject, Object).
8. Limit the number of pages: Since you specified a certain number of pages to retrieve from each search engine, make sure to limit the number of links per search engine and per search parameter accordingly.
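A minimal sketch of steps 3 through 6 in plain Python, assuming a placeholder search_links(query, engine) helper in place of the real scraping or API code (the actual implementation in this commit uses SeleniumBase):

from itertools import product

def search_links(query, engine):
    # Placeholder for the real scraping/API call (step 4); returns result URLs.
    return []

subjects = ["Wei Seng Phua", "Paul Phua", "Phua"]   # from Subject.txt
objects = ["Crime", "Triads", "Trial"]              # first entries of Object.txt
engines = ["google.co.uk", "google.com", "bing.co.uk", "bing.com"]

unique_links = {}  # url -> (subject, object, engine)
for subject, obj in product(subjects, objects):     # step 3: every Subject x Object pair
    query = f"{subject} {obj}"
    for engine in engines:                          # step 4: query each engine
        for url in search_links(query, engine):
            unique_links.setdefault(url, (subject, obj, engine))  # steps 5-6: keep first occurrence only

Because the dictionary is keyed on the URL, each link appears exactly once in the compiled list no matter how many queries returned it.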
149 changes: 149 additions & 0 deletions task1/Get_urls_deploy.py
@@ -0,0 +1,149 @@
from seleniumbase import SB
from time import sleep
import csv

# Google account used to sign in before searching (fill in before running).
Gmail_name = "Your_Gmail_address"
Gmail_password = "Your_Gmail_password"

# Search-engine housekeeping links to exclude from the scraped results.
remove_list = ["https://policies.google.co",
               "https://support.google.co",
               "https://www.google.co",
               "https://maps.google.co",
               "https://accounts.google.co",
               "https://translate.google.co",
               "https://myactivity.google.co",
               "https://www.bing",
               "http://help.bing",
               "https://bing.co",
               "http://go.microsoft.co",
               "https://support.microsoft.co",
               "http://help.bing.microsoft.co",
               "https://go.microsoft.co",
               "https://www.microsoft.co"]

# Load the Subject and Object keyword lists, one entry per line.
with open('Subject.txt', 'r') as file:
    subject_contents = file.read()
subject_rows = subject_contents.split("\n")
with open('Object.txt', 'r') as file:
    object_contents = file.read()
object_rows = object_contents.split("\n")

## 1. 10 pages from Google.co.uk
def google_co_uk(subject, object, sb):
    sb.open(f'https://www.google.co.uk/search?q={subject + " " + object}')
    # sleep(5)
    try:
        # Dismiss the Google sign-in prompt if it appears on the results page.
        sb.click('div.r, a[class="gb_ta gb_dd gb_Ed gb_de"]', by="css selector")
    except Exception as e:
        print("sign-in button not found")
    # sleep(5)

    page_count = 1
    urls = []

    # Collect every link on up to 10 result pages, following the "Next" button.
    while page_count <= 10:
        search_results = sb.find_elements('div.r, a', by="css selector")
        for result in search_results:
            url = result.get_attribute('href')
            urls.append(url)
        try:
            sb.click("a[id='pnnext']", by="css selector")
        except Exception as e:
            break
        page_count += 1

    # print(urls)
    # Keep only http(s) links that are not search-engine housekeeping pages.
    for url in urls:
        if url is not None and url.startswith("http") and not any(url.startswith(item) for item in remove_list):
            # print(url)
            all_urls[url] = [subject, object, "https://www.google.co.uk"]

## 2. 10 pages from Google.us
def google_us(subject, object, sb):
    sb.open(f'https://www.google.com/search?q={subject + " " + object}')
    # sleep(5)
    try:
        # Dismiss the Google sign-in prompt if it appears on the results page.
        sb.click('div.r, a[class="gb_ta gb_dd gb_Ed gb_de"]', by="css selector")
    except Exception as e:
        print("sign-in button not found")
    # sleep(5)

    page_count = 1
    urls = []

    # Collect every link on up to 10 result pages, following the "Next" button.
    while page_count <= 10:
        search_results = sb.find_elements('div.r, a', by="css selector")
        for result in search_results:
            url = result.get_attribute('href')
            urls.append(url)
        try:
            sb.click("a[id='pnnext']", by="css selector")
        except Exception as e:
            break
        page_count += 1

    # print(urls)
    # Keep only http(s) links that are not search-engine housekeeping pages.
    for url in urls:
        if url is not None and url.startswith("http") and not any(url.startswith(item) for item in remove_list):
            # print(url)
            all_urls[url] = [subject, object, "https://www.google.us"]

## 3. 5 pages from Bing.co.uk
def bing_co_uk(subject, object, sb):
    sb.open(f'https://www.bing.co.uk/search?q={subject + " " + object}')
    # sleep(5)
    # try:
    #     sb.click('a[id="id_l"]', by="css selector")
    # except Exception as e:
    #     print("sign-in button not found")
    # sleep(5)

    page_count = 1
    urls = []

    # Collect every link on up to 5 result pages, following the "Next" button.
    while page_count <= 5:
        search_results = sb.find_elements('div.r, a', by="css selector")
        for result in search_results:
            url = result.get_attribute('href')
            urls.append(url)
        try:
            sb.click("a[class='sb_pagN sb_pagN_bp b_widePag sb_bp ' ]", by="css selector")
        except Exception as e:
            break
        page_count += 1

    # print(urls)
    # Keep only http(s) links that are not search-engine housekeeping pages.
    for url in urls:
        if url is not None and url.startswith("http") and not any(url.startswith(item) for item in remove_list):
            # print(url)
            all_urls[url] = [subject, object, "https://www.bing.co.uk"]

## 4. 5 pages from Bing.us
def bing_us(subject, object, sb):
    # Placeholder: the Bing.us search is not implemented in this commit.
    return 1

all_urls = {}
# urls -> [subject, object, search_engine]

with SB(uc=True) as sb:
    # Sign in to Google first so the searches run from an authenticated session.
    sb.open("https://accounts.google.com/")
    sb.type("//input[@name='identifier']", Gmail_name)
    sb.click("//div[@id='identifierNext']")
    sb.type('input[type="password"]', Gmail_password)
    sb.click('button:contains("Next")')
    # sleep(5)
    # Run every Subject x Object combination through each search engine.
    for subject_row in subject_rows:
        for object_row in object_rows:
            google_co_uk(subject_row, object_row, sb)
            google_us(subject_row, object_row, sb)
            bing_co_uk(subject_row, object_row, sb)

# Write the de-duplicated links to url.csv as url, subject, object, search_engine.
file_path = 'url.csv'
all_urls_list = []
for key, value in all_urls.items():
    all_urls_list.append([key, value[0], value[1], value[2]])
with open(file_path, 'w', newline='') as file:
    # Create a CSV writer object
    writer = csv.writer(file)
    # Write the data to the CSV file
    writer.writerows(all_urls_list)
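For reference, each row that Get_urls_deploy.py writes to url.csv has the form url, subject, object, search_engine, with no header row. An illustrative row (the URL is a placeholder):

https://example.com/some-article,Paul Phua,Crime,https://www.google.co.uk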
12 changes: 12 additions & 0 deletions task1/Object.txt
@@ -0,0 +1,12 @@
Crime
Triads
Trial
Conviction
Arrest
Offence
Las Vegas
Vegas
San Marino
Hong Kong
FBI
Justice
3 changes: 3 additions & 0 deletions task1/Subject.txt
@@ -0,0 +1,3 @@
Wei Seng Phua
Paul Phua
Phua