# Source: IRS-LSA/preprocessing.py (MrHope03/IRS-LSA, main branch)
import json
import logging
import pickle
import re
import string

import nltk
import sklearn
from nltk import ToktokTokenizer
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data packages this module relies on: the stopword
# corpus, WordNet, and the averaged-perceptron POS tagger model.
for resource_name in ('stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource_name)
del resource_name  # keep the loop variable out of the module namespace


# Shared, module-wide NLP helpers.
lemmatizer = WordNetLemmatizer()
tokenizer = ToktokTokenizer()

# English stopwords; read after the 'stopwords' download above.
stopword_list = stopwords.words('english')


def load_articles(path="./articles/indianExp.json"):
    """Load the article collection from a JSON file.

    Args:
        path: Location of the JSON file to read. Defaults to the path
            this function originally had hard-coded.

    Returns:
        A new list containing the items of the decoded JSON document,
        or an empty list when the file does not contain valid JSON.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    try:
        with open(path, encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as exc:
        # Best-effort: a corrupt file yields no articles, but leave a trace
        # instead of failing silently.
        logging.getLogger(__name__).warning(
            "Could not decode articles from %s: %s", path, exc)
        return []
    # Copy into a fresh list, mirroring the original `articles += data`.
    return list(data)


# Module-level article list, populated once when this module is imported.
articles = load_articles()


def remove_punctuations(text):
    """Strip surrounding whitespace, then remove punctuation from ``text``.

    Relies on the ``string`` and ``re`` modules being imported at the top
    of the file.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with leading/trailing whitespace stripped and every
        punctuation character (ASCII or not, including ``_``) deleted.
        Whitespace inside the string is left untouched, so removing a
        punctuation mark between two spaces leaves both spaces behind.
    """
    text = text.strip()
    # ASCII punctuation first; this is the pass that removes '_', which
    # the regex below keeps because '_' counts as a word character.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Anything left that is neither a word character nor whitespace,
    # e.g. typographic quotes and dashes.
    text = re.sub(r'[^\w\s]', '', text)
    return text


# Ordered (pattern, replacement) pairs used by decontact(). Order matters:
# the specific forms must run before the general "n't" rule, and "n't"
# must run before the bare "'t" rule.
_CONTRACTION_RULES = (
    # specific
    (r"won't", "will not"),
    (r"can't", "can not"),
    # general
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
)


def decontact(phrase):
    """Remove standalone numbers and expand English contractions.

    Relies on the ``re`` module being imported at the top of the file.

    Args:
        phrase: The text to normalise.

    Returns:
        ``phrase`` with whole-word digit runs deleted, typographic
        apostrophes replaced by ``'``, and contractions such as
        "can't" / "they're" / "I'm" expanded. Matching is case-sensitive
        and every ``'s`` becomes " is" (possessives included). Spaces
        around a removed number are kept.
    """
    # Digits that form a whole word; "abc123" is left alone.
    phrase = re.sub(r'\b\d+\b', '', phrase)
    # Normalise the curly apostrophe so the rules below can match it.
    phrase = phrase.replace('\u2019', "'")

    for pattern, expansion in _CONTRACTION_RULES:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase


def get_wordnet_pos(word):
    """Return the WordNet POS constant matching ``word``'s POS tag.

    The word is tagged on its own (a one-item list passed to
    ``nltk.pos_tag``), and the first letter of the resulting tag selects
    adjective, noun, verb or adverb. Any other tag falls back to noun.
    """
    _, tag_name = nltk.pos_tag([word])[0]
    initial = tag_name[0].upper()

    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN


def capture_lemmatization(tokens):
    """Lemmatize each token using the POS looked up for that token.

    Returns a new list with one lemma per input token, in input order.
    """
    lemmas = []
    for word in tokens:
        pos = get_wordnet_pos(word)
        lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas


def generate_tokens(text):
    """Lower-case ``text`` and split it with the module-level tokenizer."""
    return tokenizer.tokenize(text.lower())


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file."""
    # NOTE: unpickling can execute arbitrary code. Only load model files
    # produced by this project, never files from an untrusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


def query_processing(query):
    """Rank the stored documents by cosine similarity to ``query``.

    The query is tokenized and lemmatized, turned into a vector with the
    pickled vectorizer, and compared against the pickled matrix.

    Args:
        query: Raw query string.

    Returns:
        A list of ``(index, similarity)`` tuples sorted by descending
        similarity, where ``index`` is the column position in the
        similarity row (presumably the document's row in the stored
        matrix; confirm against the code that builds the pickles).

    Raises:
        OSError: If either model file cannot be opened.
    """
    vectorizer = _load_pickle('./models/vectorizer.pickle')

    query_tokens = capture_lemmatization(generate_tokens(query))
    query_vec = vectorizer.transform([' '.join(query_tokens)])

    tfidf = _load_pickle('./models/tfidf_matrix.pickle')

    # One query row against every stored row; sims[0] is that single row.
    sims = cosine_similarity(query_vec, tfidf)
    # Negated key keeps ties in their original index order.
    return sorted(enumerate(sims[0]), key=lambda item: -item[1])