-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathviterbi.py
More file actions
executable file
·93 lines (75 loc) · 3.42 KB
/
viterbi.py
File metadata and controls
executable file
·93 lines (75 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
# algorithm adapted from:
# - https://ben.bolte.cc/viterbi
# With data from:
# - Speech and Language Processing
# - Dan Jurafsky and James H. Martin
# - Chapter 8
# - copyright 2021
import numpy as np
from typing import List, Tuple
class States:
def __init__(self, filename):
self.filename = filename
with open(filename) as f:
state_lines = [line.replace('\n', '').split(',') for line in f.readlines()]
self.state_lookup = dict((l.upper(), i) for i, l in enumerate(state_lines.pop(0)[1:]))
self.reverse_lookup = dict((v, k) for k, v in self.state_lookup.items())
self.state_probabilities = dict((tr[0], [float(t) for t in tr[1:]]) for tr in state_lines)
self.matrix = np.array([v for v in self.state_probabilities.values()])
def __getitem__(self, key):
return self.state_probabilities[key]
def states_for_pos(self, current, last):
if type(current) is str:
current = current.upper()
current_idx = self.state_lookup[current]
else:
current_idx = current
last = last.upper()
return self.state_probabilities[last][current_idx]
def col(self, col):
if type(col) is str:
col = self.state_lookup[col]
return self.matrix[:, col]
class Viterbi:
def __init__(self):
self.transitions = States('transitions.csv')
self.emissions = States('emissions.csv')
def time_step(self, observed_state, mu = None):
if mu is None:
emission = self.emissions.col(observed_state)
initial = self.transitions['<start>']
mu = initial * emission
max_idx = None
else:
state_probabilities = mu * self.transitions.matrix[1:].T
max_idx = np.argmax(state_probabilities, axis=1)
max_values = state_probabilities[np.arange(len(max_idx)), max_idx] # get the max values
mu = max_values * self.emissions.col(observed_state) # multiply max values by emission probabilities
return mu, max_idx
def trace_backwards(self, mu, state_matrix):
state = np.argmax(mu) # start at right-hand side max value
sequence_prob = mu[state]
state_sequence = [self.transitions.reverse_lookup[state]]
for states in state_matrix[::-1]: # reverse iterate
state = states[state]
state_sequence.append(self.transitions.reverse_lookup[state])
return state_sequence[::-1], sequence_prob
def run(self, string: str) -> Tuple[List[str], float]:
"""Runs the Viterbi algorithm to get the most likely state sequence.
:param string: the string with observations to parse for parts-of-speech
:returns: tuple of (most likely series of states, joint probability of that series)
"""
observed_states = [s.upper() for s in string.split()]
mu, _ = self.time_step(observed_states.pop(0))
state_matrix = []
# runs the forward pass, storing the most likely previous state at each step
for observed_state in observed_states:
mu, max_idx = self.time_step(observed_state, mu)
state_matrix.append(max_idx)
# Traces backwards to get the maximum likelihood sequence.
return self.trace_backwards(mu, state_matrix)
model = Viterbi()
path, prob = model.run('Janet will back the bill')
print(f' Max Sequence: {path}')
print(f'Seq Probability: {prob}')