CL-Project2/scraping.py at main · sentient-bread/CL-Project2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from config import *
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os
import re


def clean_text(text):
    # takes text and gets rid of the english stuff from it
    return_text = re.sub(r'[A-Z]|[0-9]|[a-z]', "", text)
    return_text = re.sub('\n', '', return_text)
    return return_text

def scrape_page(url, corpus_file):
    # takes corpus_file object
    # takes url
    # scrapes url and writes to corpus_file
    # corpus_file is either in write mode or readmode
    # that is an assumption

    response = requests.get(url)
    if response.status_code == 404:
        raise ValueError("Invalid url provided")
    soup_for_page = BeautifulSoup(response.content, 'html.parser')

    page_title = soup_for_page.find_all('title')[0]
    print(f"Scraping {page_title.get_text()}")

    all_paragraphs = soup_for_page.find_all('p')
    # parsed_url = urllib.parse.urlparse(url)
    output_text = ""

    for para in all_paragraphs:
        for content in para.contents:
            # each content has a string attribute
            # which represents the text

            string = content.string
            if string is not None:
                output_text += string

    output_text_cleaned = clean_text(output_text)
    corpus_file.write(output_text_cleaned)
    return output_text_cleaned

def create_relevant_data_files(data_dir_path):

    try:
        os.mkdir(data_dir_path)
    except FileExistsError:
        # in case the data directory already exists
        pass


    file_path = os.path.join(data_dir_path, "corpus.txt")
    print("file path", file_path)

    try:
        f = open(file_path, "x")
        f.close()
    except FileExistsError:
        # file was already made
        pass

    return file_path

def sentence_tokenize(text):
    # simple tokenization using regex

    tokenized_list = re.split(SENTENCE_DELIMITER, text)
    # splits using the SENTENCE_DELIMITER
    # each delimiter is also turned into a token itself
    tokenized_list_final = []

    for index, token in enumerate(tokenized_list):
        if bool(re.match(SENTENCE_DELIMITER, token)):
            # this is to weed out the elements that are the delimiters themselves
            continue
        if index < len(tokenized_list)-1:
            if tokenized_list[index + 1] != "?":
                # we don't want question sentences

                tokenized_list_final.append(token)
    return tokenized_list_final