|
| 1 | +#!/usr/bin/env python |
| 2 | + |
def stemWord(w):
    """Reduce a word to a plain lower-case token for NLP preprocessing.

    Everything from the first apostrophe onwards is discarded (which drops
    a possessive "'s"), every occurrence of the characters ";-?.,!:" is
    removed wherever it appears, and what remains is lower-cased.

    Args:
        w (str): A string containing the input word.

    Returns:
        str: The simplified version of the word.
    """
    punctuation = ';-?.,!:'
    # Keep only the text in front of the first apostrophe.
    head = w.partition("'")[0]
    kept = (ch for ch in head if ch not in punctuation)
    return ''.join(kept).lower()
| 20 | + |
| 21 | + |
class LanguageModel:
    """Word-frequency model built from one or more text strings.

    The model remembers every text it has been given, together with a
    dictionary that maps each stemmed word (see stemWord) to the number of
    times it occurred across all of those texts.
    """

    @staticmethod
    def getWordFreq(txt):
        """Count how often each stemmed word occurs in a text.

        The text is split on whitespace and every token is passed through
        stemWord before it is counted, so "The dog likes the cat!" yields
        {"the": 2, "dog": 1, "likes": 1, "cat": 1}.

        Args:
            txt (str): a string containing free text.

        Returns:
            dict: stemmed words (str) as keys and their occurrence counts
                (int) as values.
        """
        freq = {}
        for token in txt.split():
            word = stemWord(token)
            # dict.get replaces dict.has_key, which no longer exists in
            # Python 3.
            # NOTE: a token that stems to '' is counted under the '' key.
            freq[word] = freq.get(word, 0) + 1
        return freq

    @staticmethod
    def mergeWordFreqDict(frDict1, frDict2):
        """Return the sum of two word-frequency dictionaries.

        The result is the frequency dictionary of the concatenation of the
        two texts the arguments were computed from. For example
        {"hello": 1, "world": 1} merged with {"goodbye": 1, "world": 1}
        gives {"hello": 1, "world": 2, "goodbye": 1}.

        Neither argument is modified.

        Args:
            frDict1 (dict): stemmed words as keys, positive integers as
                values.
            frDict2 (dict): stemmed words as keys, positive integers as
                values.

        Returns:
            dict: a new dictionary with stemmed words as keys and the summed
                counts as values.
        """
        # Copy first so the caller's frDict1 is not changed as a side effect.
        merged = dict(frDict1)
        for word, count in frDict2.items():
            merged[word] = merged.get(word, 0) + count
        return merged

    def __init__(self, txtList=None):
        """LanguageModel constructor.

        Initialises the instance members to valid values.
        __texts is a list with the text strings added so far.
        __wordFreq is a dictionary with stemmed words for keys and the
        count of the occurrences of each word (int) as values.

        Args:
            txtList (list): A list of strings where each string contains
                some text. Omitting it (or passing None) creates an empty
                model.

        Raises:
            TypeError: if txtList is given and is not a list. TypeError is
                a subclass of Exception, so "except Exception" still
                catches it.
        """
        self.__wordFreq = {}  # Dictionary with the word frequencies.
        self.__texts = []  # Every text string added to the model.
        if txtList is None:
            # None stands in for "no texts"; avoids a mutable default value.
            txtList = []
        if not isinstance(txtList, list):
            raise TypeError('txtList must be a list of strings')
        for txt in txtList:
            self.addText(txt)

    def addText(self, txt):
        """Add a string containing text to the model.

        Args:
            txt (str): the string containing text to be added.
        """
        # Count before storing, so that a failing input leaves the model
        # unchanged instead of recording a text that was never counted.
        newFreq = LanguageModel.getWordFreq(txt)
        self.__texts.append(txt)
        self.__wordFreq = LanguageModel.mergeWordFreqDict(self.__wordFreq,
                                                          newFreq)

    def addTextFile(self, fileName):
        """Add the text contained in a text file.

        Args:
            fileName (str): the absolute or relative path to a file
                containing text.
        """
        # The with-block closes the file even if reading fails.
        with open(fileName) as textFile:
            contents = textFile.read()
        self.addText(contents)

    def wordCount(self):
        """Return the total number of words found in all added texts.

        Returns:
            int: the sum of the occurrence counts of every word.
        """
        return sum(self.__wordFreq.values())

    def uniqueWordCount(self):
        """Return the number of distinct stemmed words in the model.

        A word occurring twice or more counts as one.

        Returns:
            int: the count of unique words.
        """
        return len(self.__wordFreq)

    def getWordProbabillity(self, word):
        """Return the probability of a word occurring according to the model.

        The probability is the number of times the (stemmed) word has
        occurred divided by the count of all word occurrences.

        Args:
            word (str): a word, not necessarily stemmed.

        Returns:
            float: a value between 0 and 1; 0.0 when the word is unknown.
        """
        occurrences = self.__wordFreq.get(stemWord(word), 0)
        if not occurrences:
            return 0.0
        # float() keeps the division exact under Python 2 as well.
        return occurrences / float(self.wordCount())

    def __str__(self):
        """Generate a string description of the language model.

        For LanguageModel(['hello world', 'Goodbye World!']) the result is
        "LanguageModel\\n\\t#texts:2\\n\\t#words:4\\n\\t#unique words:3\\n".

        Returns:
            str: A description of the language model spanning 4 lines.
        """
        return ("LanguageModel\n\t#texts:" + str(len(self.__texts))
                + "\n\t#words:" + str(self.wordCount())
                + "\n\t#unique words:" + str(self.uniqueWordCount())
                + "\n")

    def __repr__(self):
        """Generate a constructor expression that rebuilds the model.

        Returns:
            str: the module-qualified class name followed by the list of
                texts in parentheses, e.g. "mymodule.LanguageModel(['a b'])".
        """
        cls = self.__class__
        # str(cls) renders as "<class '...'>" for Python 3 classes, which is
        # not an expression, so the qualified name is spelled out instead.
        return cls.__module__ + '.' + cls.__name__ + '(' + repr(self.__texts) + ')'

    def getWordsByProbabillity(self):
        """List all stemmed words from the most to the least probable.

        Returns:
            list: a list of strings (not tuples!).
        """
        # Every probability is count / total with the same total, so sorting
        # by the stored count gives the same order without re-stemming and
        # re-summing for each word.
        ascending = sorted(self.__wordFreq, key=self.__wordFreq.__getitem__)
        # Reverse the ascending sort rather than passing reverse=True,
        # because the two order equally frequent words differently.
        return list(reversed(ascending))
| 253 | + |
def isPalindrome(sentence):
    """Tell whether a sentence is a palindrome.

    Palindromes are sentences whose characters read the same in both
    directions. The sentence is first normalised with stemWord, which
    lower-cases it, removes the punctuation marks ";-?.,!:" and drops
    everything from the first apostrophe onwards. All whitespace is then
    ignored for the comparison.

    Args:
        sentence (str): A string with one or more words, assumed to have no
            possessive (text after an apostrophe is discarded).

    Returns:
        bool: True if the sentence is a palindrome, False otherwise.
    """
    normalised = stemWord(sentence)
    # split() with no argument breaks on any run of whitespace, so tabs and
    # newlines are ignored as well as plain spaces.
    letters = ''.join(normalised.split())
    # A string is a palindrome when it equals its own reverse.
    return letters == letters[::-1]
| 289 | + |
| 290 | + |
if __name__ == '__main__':
    # Runs only when this file is executed as a script, not when imported.
    # Everything here is ignored by joc-de-proves ("test suite"), so this is
    # the place to debug the program by trying out its functions and classes.
    pass
0 commit comments