Commit

complete
versatile0118 committed Aug 24, 2023
0 parents commit ebdb00f
Showing 76 changed files with 11,342 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
task1/Get_urls.py
task2/.env
6 changes: 6 additions & 0 deletions task1/--Input-Output.txt
@@ -0,0 +1,6 @@
Input files
Object.txt
Subject.txt
Script
Get_urls_deploy.py
Output file
url.csv
14 changes: 14 additions & 0 deletions task1/--Problem.txt
@@ -0,0 +1,14 @@
Google search links
-------------
Input
We are looking to get a compiled list of links related to adverse media about a specific person. We will provide a list of words (Subject) that will be combined with another list of words (Object).

Deliverable
We aim to compile a collection of unique links resulting from every conceivable keyword combination sourced from the following search parameters:
Subject 1 with Object 1, 2, 3, 4, 5, …, n
Subject 2 with Object 1, 2, 3, 4, 5, …, n
Subject n with Object 1, 2, 3, 4, 5, …, n
• 10 pages from Google.co.uk
• 10 pages from Google.us
• 5 pages from Bing.co.uk
• 5 pages from Bing.us
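For example, with the Subject.txt shipped in this commit (3 names) and Object.txt (12 terms), that is 3 × 12 = 36 distinct queries, each of which is run against all four result sets listed above.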
9 changes: 9 additions & 0 deletions task1/--Solution.txt
@@ -0,0 +1,9 @@
To achieve this, you would need to perform the following steps:
1. Create a list of Subject keywords: Compile a list of words or phrases that describe the specific person you are interested in. These keywords will be used to search for relevant information.
2. Create a list of Object keywords: Prepare another list of words or phrases that are related to the adverse media you are looking for. These keywords will be combined with the Subject keywords to generate search queries.
3. Generate search queries: Combine each Subject keyword with every Object keyword to create search queries. For example, if you have three Subject keywords (Subject 1, Subject 2, Subject 3) and five Object keywords (Object 1, Object 2, Object 3, Object 4, Object 5), you will have a total of 15 search queries; a minimal sketch of this combination and de-duplication logic follows this list.
4. Perform the search: Use a web scraping tool or a search engine API to search for each query on the specified search engines (Google.co.uk, Google.us, Bing.co.uk, Bing.us). Retrieve the search results for each query.
5. Extract the links: From the search results, extract the URLs of the web pages. Remove any duplicate links to ensure a collection of unique links.
6. Compile the final list: Combine all the unique links obtained from the search results into a single compiled list.
7. Organize the list: Categorize the links based on the search engine and the search parameters used (e.g., Google.co.uk, Google.us, Bing.co.uk, Bing.us, Subject, Object).
8. Limit the number of pages: Since you specified a certain number of pages to retrieve from each search engine, make sure to limit the number of links per search engine and per search parameter accordingly.
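A minimal sketch of steps 3 through 6 in plain Python, assuming a placeholder search_links(query, engine) helper in place of the real scraping or API code (the actual implementation in this commit uses SeleniumBase):

from itertools import product

def search_links(query, engine):
    # Placeholder for the real scraping/API call (step 4); returns result URLs.
    return []

subjects = ["Wei Seng Phua", "Paul Phua", "Phua"]   # from Subject.txt
objects = ["Crime", "Triads", "Trial"]              # first entries of Object.txt
engines = ["google.co.uk", "google.com", "bing.co.uk", "bing.com"]

unique_links = {}  # url -> (subject, object, engine)
for subject, obj in product(subjects, objects):     # step 3: every Subject x Object pair
    query = f"{subject} {obj}"
    for engine in engines:                          # step 4: query each engine
        for url in search_links(query, engine):
            unique_links.setdefault(url, (subject, obj, engine))  # steps 5-6: keep first occurrence only

Because the dictionary is keyed on the URL, each link appears exactly once in the compiled list no matter how many queries returned it.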
149 changes: 149 additions & 0 deletions task1/Get_urls_deploy.py
@@ -0,0 +1,149 @@
from seleniumbase import SB
from time import sleep
import csv

# Google account used to sign in before searching (fill in before running).
Gmail_name = "Your_Gmail_address"
Gmail_password = "Your_Gmail_password"

# Search-engine housekeeping links to exclude from the scraped results.
remove_list = ["https://policies.google.co",
               "https://support.google.co",
               "https://www.google.co",
               "https://maps.google.co",
               "https://accounts.google.co",
               "https://translate.google.co",
               "https://myactivity.google.co",
               "https://www.bing",
               "http://help.bing",
               "https://bing.co",
               "http://go.microsoft.co",
               "https://support.microsoft.co",
               "http://help.bing.microsoft.co",
               "https://go.microsoft.co",
               "https://www.microsoft.co"]

# Load the Subject and Object keyword lists, one entry per line.
with open('Subject.txt', 'r') as file:
    subject_contents = file.read()
subject_rows = subject_contents.split("\n")
with open('Object.txt', 'r') as file:
    object_contents = file.read()
object_rows = object_contents.split("\n")

## 1. 10 pages from Google.co.uk
def google_co_uk(subject, object, sb):
    sb.open(f'https://www.google.co.uk/search?q={subject + " " + object}')
    # sleep(5)
    try:
        # Dismiss the Google sign-in prompt if it appears on the results page.
        sb.click('div.r, a[class="gb_ta gb_dd gb_Ed gb_de"]', by="css selector")
    except Exception as e:
        print("sign-in button not found")
    # sleep(5)

    page_count = 1
    urls = []

    # Collect every link on up to 10 result pages, following the "Next" button.
    while page_count <= 10:
        search_results = sb.find_elements('div.r, a', by="css selector")
        for result in search_results:
            url = result.get_attribute('href')
            urls.append(url)
        try:
            sb.click("a[id='pnnext']", by="css selector")
        except Exception as e:
            break
        page_count += 1

    # print(urls)
    # Keep only http(s) links that are not search-engine housekeeping pages.
    for url in urls:
        if url is not None and url.startswith("http") and not any(url.startswith(item) for item in remove_list):
            # print(url)
            all_urls[url] = [subject, object, "https://www.google.co.uk"]

## 2. 10 pages from Google.us
def google_us(subject, object, sb):
    sb.open(f'https://www.google.com/search?q={subject + " " + object}')
    # sleep(5)
    try:
        # Dismiss the Google sign-in prompt if it appears on the results page.
        sb.click('div.r, a[class="gb_ta gb_dd gb_Ed gb_de"]', by="css selector")
    except Exception as e:
        print("sign-in button not found")
    # sleep(5)

    page_count = 1
    urls = []

    # Collect every link on up to 10 result pages, following the "Next" button.
    while page_count <= 10:
        search_results = sb.find_elements('div.r, a', by="css selector")
        for result in search_results:
            url = result.get_attribute('href')
            urls.append(url)
        try:
            sb.click("a[id='pnnext']", by="css selector")
        except Exception as e:
            break
        page_count += 1

    # print(urls)
    # Keep only http(s) links that are not search-engine housekeeping pages.
    for url in urls:
        if url is not None and url.startswith("http") and not any(url.startswith(item) for item in remove_list):
            # print(url)
            all_urls[url] = [subject, object, "https://www.google.us"]

## 3. 5 pages from Bing.co.uk
def bing_co_uk(subject, object, sb):
    sb.open(f'https://www.bing.co.uk/search?q={subject + " " + object}')
    # sleep(5)
    # try:
    #     sb.click('a[id="id_l"]', by="css selector")
    # except Exception as e:
    #     print("sign-in button not found")
    # sleep(5)

    page_count = 1
    urls = []

    # Collect every link on up to 5 result pages, following the "Next" button.
    while page_count <= 5:
        search_results = sb.find_elements('div.r, a', by="css selector")
        for result in search_results:
            url = result.get_attribute('href')
            urls.append(url)
        try:
            sb.click("a[class='sb_pagN sb_pagN_bp b_widePag sb_bp ' ]", by="css selector")
        except Exception as e:
            break
        page_count += 1

    # print(urls)
    # Keep only http(s) links that are not search-engine housekeeping pages.
    for url in urls:
        if url is not None and url.startswith("http") and not any(url.startswith(item) for item in remove_list):
            # print(url)
            all_urls[url] = [subject, object, "https://www.bing.co.uk"]

## 4. 5 pages from Bing.us
def bing_us(subject, object, sb):
    # Placeholder: the Bing.us search is not implemented in this commit.
    return 1

all_urls = {}
# urls -> [subject, object, search_engine]

with SB(uc=True) as sb:
    # Sign in to Google first so the searches run from an authenticated session.
    sb.open("https://accounts.google.com/")
    sb.type("//input[@name='identifier']", Gmail_name)
    sb.click("//div[@id='identifierNext']")
    sb.type('input[type="password"]', Gmail_password)
    sb.click('button:contains("Next")')
    # sleep(5)
    # Run every Subject x Object combination through each search engine.
    for subject_row in subject_rows:
        for object_row in object_rows:
            google_co_uk(subject_row, object_row, sb)
            google_us(subject_row, object_row, sb)
            bing_co_uk(subject_row, object_row, sb)

# Write the de-duplicated links to url.csv as url, subject, object, search_engine.
file_path = 'url.csv'
all_urls_list = []
for key, value in all_urls.items():
    all_urls_list.append([key, value[0], value[1], value[2]])
with open(file_path, 'w', newline='') as file:
    # Create a CSV writer object
    writer = csv.writer(file)
    # Write the data to the CSV file
    writer.writerows(all_urls_list)
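For reference, each row that Get_urls_deploy.py writes to url.csv has the form url, subject, object, search_engine, with no header row. An illustrative row (the URL is a placeholder):

https://example.com/some-article,Paul Phua,Crime,https://www.google.co.uk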
12 changes: 12 additions & 0 deletions task1/Object.txt
@@ -0,0 +1,12 @@
Crime
Triads
Trial
Conviction
Arrest
Offence
Las Vegas
Vegas
San Marino
Hong Kong
FBI
Justice
3 changes: 3 additions & 0 deletions task1/Subject.txt
@@ -0,0 +1,3 @@
Wei Seng Phua
Paul Phua
Phua