-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnltkWrapper.py
109 lines (91 loc) · 2.67 KB
/
nltkWrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
'''
Author: Jacob Goldman
NLTK CorpusReader for the song lyrics corpus
'''
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class Song(object):
    """A song: an optional title plus an ordered list of stanzas.

    Every stanza is expected to expose a ``lines`` attribute holding its
    lyric lines as strings.
    """

    def __init__(self, title=None, stanzas=None):
        """Create a song.

        :param title: title of the song, or None when unknown.
        :param stanzas: optional iterable of stanza objects; omitted or
            None gives a song with no stanzas.
        """
        self.title = title
        # Copy the input so add_stanzas() never mutates the caller's list.
        self.stanzas = [] if stanzas is None else list(stanzas)

    def lines(self):
        """Return all lyric lines in order.

        A single newline string is appended after the lines of each
        stanza, so stanza breaks stay visible in the flat list.
        """
        lines = []
        for stanza in self.stanzas:
            lines.extend(stanza.lines)
            lines.append('\n')
        return lines

    def words(self):
        """Return the word and punctuation tokens of every line, in order."""
        # One tokenizer for the whole song instead of one per line.
        tokenizer = WordPunctTokenizer()
        words = []
        for line in self.lines():
            words.extend(tokenizer.tokenize(line))
        return words

    def add_stanzas(self, stanza):
        """Append a single stanza to the end of the song."""
        self.stanzas.append(stanza)

    def __repr__(self):
        return 'Song(title="{}", stanzas={})'.format(self.title, self.stanzas)
class Stanza(object):
    """One stanza of a song, held as a sequence of lyric lines."""

    def __init__(self, lines):
        """Keep ``lines`` (the stanza's lyric lines) on the instance."""
        # Rhyme annotations would be attached here.
        self.lines = lines

    def __repr__(self):
        """Return a constructor-style string showing the stored lines."""
        template = 'Stanza(lines={})'
        return template.format(self.lines)
class SongCorpusReader(CorpusReader):
    """Corpus reader for plain-text song lyric files.

    Within a file, a line consisting only of a newline ends a stanza and
    a ``<SONG_BOUNDARY>`` line ends a song.
    """

    CorpusView = StreamBackedCorpusView

    # Marker line that closes a song inside a corpus file.
    _SONG_BOUNDARY = '<SONG_BOUNDARY>'

    def __init__(self, root=None, fileids=None,
                 word_tokenizer=WordPunctTokenizer(), encoding='utf8'):
        """Set up the reader.

        :param root: corpus directory; defaults to ``'./data'``.
        :param fileids: files to read; defaults to every entry found
            directly inside ``root``.
        :param word_tokenizer: object whose ``tokenize(str)`` method is
            used by :meth:`words`.
        :param encoding: encoding passed on to ``CorpusReader``.
        """
        if root is None:
            root = './data'
        if fileids is None:
            # List the directory that is actually being read (not a
            # hard-coded './data'); sorted for a reproducible file order.
            fileids = sorted(os.listdir(root))
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer

    def raw(self, fileids=None):
        """Return the concatenated raw text of the given files.

        :param fileids: a single file id, a list of file ids, or None
            for every file known to the reader.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        raw_texts = []
        for fileid in fileids:
            # The context manager closes the stream even if read() raises.
            with self.open(fileid) as stream:
                raw_texts.append(stream.read())
        return concat(raw_texts)

    def words(self, fileids=None):
        """Return the tokens of the given files as a concatenated view."""
        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
                       for (path, enc, fileid)
                       in self.abspaths(fileids, True, True)])

    def songs(self, fileids=None):
        """Return the ``Song`` objects of the given files as a concatenated view."""
        return concat([self.CorpusView(path, self._read_song_block, encoding=enc)
                       for (path, enc, fileid)
                       in self.abspaths(fileids, True, True)])

    def _is_song_boundary(self, line):
        """Tell whether ``line`` is the song marker, with or without its newline."""
        return line in (self._SONG_BOUNDARY, self._SONG_BOUNDARY + '\n')

    def _read_song_block(self, stream):
        """Read every remaining song from ``stream`` and return them as a list.

        A song is emitted at each boundary line.  A final song that is
        not followed by a boundary is emitted too, provided it contains
        at least one lyric line, so trailing blank lines do not create a
        phantom song.

        NOTE(review): a boundary line that directly follows a lyric line
        (no blank line in between) is consumed by _read_stanza_block as
        part of the stanza -- confirm the corpus always puts a blank line
        before the marker.
        """
        songs = []
        song = Song()
        while True:
            song.add_stanzas(self._read_stanza_block(stream))
            oldpos = stream.tell()
            line = stream.readline()
            if not line:
                # End of file: keep the unterminated last song if it has text.
                if any(stanza.lines for stanza in song.stanzas):
                    songs.append(song)
                return songs
            if self._is_song_boundary(line):
                songs.append(song)
                song = Song()
            else:
                # Not a boundary: push the line back for the next stanza.
                stream.seek(oldpos)

    def _read_stanza_block(self, stream):
        """Read lines up to the next blank line or end of file.

        :return: a ``Stanza`` holding the lines read, each without its
            trailing newline; the blank terminator line is consumed but
            not stored.
        """
        lines = []
        while True:
            line = stream.readline()
            if not line or line == '\n':
                return Stanza(lines)
            # Drop only a real trailing newline, so a last line without
            # one keeps its final character.
            if line.endswith('\n'):
                line = line[:-1]
            lines.append(line)

    def _read_word_block(self, stream):
        """Tokenize up to 20 lines from ``stream``, skipping boundary lines."""
        words = []
        for _ in range(20):
            line = stream.readline()
            if not line:
                # End of file: nothing further to tokenize.
                break
            if not self._is_song_boundary(line):
                words.extend(self._word_tokenizer.tokenize(line))
        return words