|
| 1 | +#!/usr/bin/env perl |
| 2 | + |
| 3 | +# Copyright 2014 University of Edinburgh (Author: Pawel Swietojanski) |
| 4 | + |
| 5 | +# The script - based on punctuation times - splits segments longer than #words (input parameter) |
| 6 | +# and produces bit more more normalised form of transcripts, as follows |
| 7 | +# MeetID Channel Spkr stime etime transcripts |
| 8 | + |
| 9 | +#use List::MoreUtils 'indexes'; |
| 10 | +use strict; |
| 11 | +use warnings; |
| 12 | + |
| 13 | +sub split_transcripts; |
| 14 | +sub normalise_transcripts; |
| 15 | + |
| 16 | +sub merge_hashes { |
| 17 | + my ($h1, $h2) = @_; |
| 18 | + my %hash1 = %$h1; my %hash2 = %$h2; |
| 19 | + foreach my $key2 ( keys %hash2 ) { |
| 20 | + if( exists $hash1{$key2} ) { |
| 21 | + warn "Key [$key2] is in both hashes!"; |
| 22 | + next; |
| 23 | + } else { |
| 24 | + $hash1{$key2} = $hash2{$key2}; |
| 25 | + } |
| 26 | + } |
| 27 | + return %hash1; |
| 28 | +} |
| 29 | + |
| 30 | +sub print_hash { |
| 31 | + my ($h) = @_; |
| 32 | + my %hash = %$h; |
| 33 | + foreach my $k (sort keys %hash) { |
| 34 | + print "$k : $hash{$k}\n"; |
| 35 | + } |
| 36 | +} |
| 37 | + |
| 38 | +sub get_name { |
| 39 | + #no warnings; |
| 40 | + my $sname = sprintf("%07d_%07d", $_[0]*100, $_[1]*100) || die 'Input undefined!'; |
| 41 | + #use warnings; |
| 42 | + return $sname; |
| 43 | +} |
| 44 | + |
| 45 | +sub split_on_comma { |
| 46 | + |
| 47 | + my ($text, $comma_times, $btime, $etime, $max_words_per_seg)= @_; |
| 48 | + my %comma_hash = %$comma_times; |
| 49 | + |
| 50 | + print "Btime, Etime : $btime, $etime\n"; |
| 51 | + |
| 52 | + my $stime = ($etime+$btime)/2; #split time |
| 53 | + my $skey = ""; |
| 54 | + my $otime = $btime; |
| 55 | + foreach my $k (sort {$comma_hash{$a} cmp $comma_hash{$b} } keys %comma_hash) { |
| 56 | + print "Key : $k : $comma_hash{$k}\n"; |
| 57 | + my $ktime = $comma_hash{$k}; |
| 58 | + if ($ktime==$btime) { next; } |
| 59 | + if ($ktime==$etime) { last; } |
| 60 | + if (abs($stime-$ktime)/2<abs($stime-$otime)/2) { |
| 61 | + $otime = $ktime; |
| 62 | + $skey = $k; |
| 63 | + } |
| 64 | + } |
| 65 | + |
| 66 | + my %transcripts = (); |
| 67 | + |
| 68 | + if (!($skey =~ /[\,][0-9]+/)) { |
| 69 | + print "Cannot split into less than $max_words_per_seg words! Leaving : $text\n"; |
| 70 | + $transcripts{get_name($btime, $etime)}=$text; |
| 71 | + return %transcripts; |
| 72 | + } |
| 73 | + |
| 74 | + print "Splitting $text on $skey at time $otime (stime is $stime)\n"; |
| 75 | + my @utts1 = split(/$skey\s+/, $text); |
| 76 | + for (my $i=0; $i<=$#utts1; $i++) { |
| 77 | + my $st = $btime; |
| 78 | + my $et = $comma_hash{$skey}; |
| 79 | + if ($i>0) { |
| 80 | + $st=$comma_hash{$skey}; |
| 81 | + $et = $etime; |
| 82 | + } |
| 83 | + my (@utts) = split (' ', $utts1[$i]); |
| 84 | + if ($#utts < $max_words_per_seg) { |
| 85 | + my $nm = get_name($st, $et); |
| 86 | + print "SplittedOnComma[$i]: $nm : $utts1[$i]\n"; |
| 87 | + $transcripts{$nm} = $utts1[$i]; |
| 88 | + } else { |
| 89 | + print 'Continue splitting!'; |
| 90 | + my %transcripts2 = split_on_comma($utts1[$i], \%comma_hash, $st, $et, $max_words_per_seg); |
| 91 | + %transcripts = merge_hashes(\%transcripts, \%transcripts2); |
| 92 | + } |
| 93 | + } |
| 94 | + return %transcripts; |
| 95 | +} |
| 96 | + |
| 97 | +sub split_transcripts { |
| 98 | + @_ == 4 || die 'split_transcripts: transcript btime etime max_word_per_seg'; |
| 99 | + |
| 100 | + my ($text, $btime, $etime, $max_words_per_seg) = @_; |
| 101 | + my (@transcript) = @$text; |
| 102 | + |
| 103 | + my (@punct_indices) = grep { $transcript[$_] =~ /^[\.,\?\!\:]$/ } 0..$#transcript; |
| 104 | + my (@time_indices) = grep { $transcript[$_] =~ /^[0-9]+\.[0-9]*/ } 0..$#transcript; |
| 105 | + my (@puncts_times) = delete @transcript[@time_indices]; |
| 106 | + my (@puncts) = @transcript[@punct_indices]; |
| 107 | + |
| 108 | + if ($#puncts_times != $#puncts) { |
| 109 | + print 'Ooops, different number of punctuation signs and timestamps! Skipping.'; |
| 110 | + return (); |
| 111 | + } |
| 112 | + |
| 113 | + #first split on full stops |
| 114 | + my (@full_stop_indices) = grep { $puncts[$_] =~ /[\.\?]/ } 0..$#puncts; |
| 115 | + my (@full_stop_times) = @puncts_times[@full_stop_indices]; |
| 116 | + |
| 117 | + unshift (@full_stop_times, $btime); |
| 118 | + push (@full_stop_times, $etime); |
| 119 | + |
| 120 | + my %comma_puncts = (); |
| 121 | + for (my $i=0, my $j=0;$i<=$#punct_indices; $i++) { |
| 122 | + my $lbl = "$transcript[$punct_indices[$i]]$j"; |
| 123 | + if ($lbl =~ /[\.\?].+/) { next; } |
| 124 | + $transcript[$punct_indices[$i]] = $lbl; |
| 125 | + $comma_puncts{$lbl} = $puncts_times[$i]; |
| 126 | + $j++; |
| 127 | + } |
| 128 | + |
| 129 | + #print_hash(\%comma_puncts); |
| 130 | + |
| 131 | + print "InpTrans : @transcript\n"; |
| 132 | + print "Full stops: @full_stop_times\n"; |
| 133 | + |
| 134 | + my @utts1 = split (/[\.\?]/, uc join(' ', @transcript)); |
| 135 | + my %transcripts = (); |
| 136 | + for (my $i=0; $i<=$#utts1; $i++) { |
| 137 | + my (@utts) = split (' ', $utts1[$i]); |
| 138 | + if ($#utts < $max_words_per_seg) { |
| 139 | + print "ReadyTrans: $utts1[$i]\n"; |
| 140 | + $transcripts{get_name($full_stop_times[$i], $full_stop_times[$i+1])} = $utts1[$i]; |
| 141 | + } else { |
| 142 | + print "TransToSplit: $utts1[$i]\n"; |
| 143 | + my %transcripts2 = split_on_comma($utts1[$i], \%comma_puncts, $full_stop_times[$i], $full_stop_times[$i+1], $max_words_per_seg); |
| 144 | + print "Hash TR2:\n"; print_hash(\%transcripts2); |
| 145 | + print "Hash TR:\n"; print_hash(\%transcripts); |
| 146 | + %transcripts = merge_hashes(\%transcripts, \%transcripts2); |
| 147 | + print "Hash TR_NEW : \n"; print_hash(\%transcripts); |
| 148 | + } |
| 149 | + } |
| 150 | + return %transcripts; |
| 151 | +} |
| 152 | + |
| 153 | +sub normalise_transcripts { |
| 154 | + my $text = $_[0]; |
| 155 | + |
| 156 | + #DO SOME ROUGH AND OBVIOUS PRELIMINARY NORMALISATION, AS FOLLOWS |
| 157 | + #remove the remaining punctation labels e.g. some text ,0 some text ,1 |
| 158 | + $text =~ s/[\.\,\?\!\:][0-9]+//g; |
| 159 | + #there are some extra spurious puncations without spaces, e.g. UM,I, replace with space |
| 160 | + $text =~ s/[A-Z']+,[A-Z']+/ /g; |
| 161 | + #split words combination, ie. ANTI-TRUST to ANTI TRUST (None of them appears in cmudict anyway) |
| 162 | + #$text =~ s/(.*)([A-Z])\s+(\-)(.*)/$1$2$3$4/g; |
| 163 | + $text =~ s/\-/ /g; |
| 164 | + #substitute X_M_L with X. M. L. etc. |
| 165 | + $text =~ s/\_/. /g; |
| 166 | + #normalise and trim spaces |
| 167 | + $text =~ s/^\s*//g; |
| 168 | + $text =~ s/\s*$//g; |
| 169 | + $text =~ s/\s+/ /g; |
| 170 | + #some transcripts are empty with -, nullify (and ignore) them |
| 171 | + $text =~ s/^\-$//g; |
| 172 | + $text =~ s/\s+\-$//; |
| 173 | + # apply few exception for dashed phrases, Mm-Hmm, Uh-Huh, etc. those are frequent in AMI |
| 174 | + # and will be added to dictionary |
| 175 | + $text =~ s/MM HMM/MM\-HMM/g; |
| 176 | + $text =~ s/UH HUH/UH\-HUH/g; |
| 177 | + |
| 178 | + return $text; |
| 179 | +} |
| 180 | + |
| 181 | +if (@ARGV != 2) { |
| 182 | + print STDERR "Usage: ami_split_segments.pl <meet-file> <out-file>\n"; |
| 183 | + exit(1); |
| 184 | +} |
| 185 | + |
| 186 | +my $meet_file = shift @ARGV; |
| 187 | +my $out_file = shift @ARGV; |
| 188 | +my %transcripts = (); |
| 189 | + |
| 190 | +open(W, ">$out_file") || die "opening output file $out_file"; |
| 191 | +open(S, "<$meet_file") || die "opening meeting file $meet_file"; |
| 192 | + |
| 193 | +while(<S>) { |
| 194 | + |
| 195 | + my @A = split(" ", $_); |
| 196 | + if (@A < 9) { print "Skipping line @A"; next; } |
| 197 | + |
| 198 | + my ($meet_id, $channel, $spk, $channel2, $trans_btime, $trans_etime, $aut_btime, $aut_etime) = @A[0..7]; |
| 199 | + my @transcript = @A[8..$#A]; |
| 200 | + my %transcript = split_transcripts(\@transcript, $aut_btime, $aut_etime, 30); |
| 201 | + |
| 202 | + for my $key (keys %transcript) { |
| 203 | + my $value = $transcript{$key}; |
| 204 | + my $segment = normalise_transcripts($value); |
| 205 | + my @times = split(/\_/, $key); |
| 206 | + if ($times[0] >= $times[1]) { |
| 207 | + print "Warning, $meet_id, $spk, $times[0] > $times[1]. Skipping. \n"; next; |
| 208 | + } |
| 209 | + if (length($segment)>0) { |
| 210 | + print W join " ", $meet_id, "H0${channel2}", $spk, $times[0]/100.0, $times[1]/100.0, $segment, "\n"; |
| 211 | + } |
| 212 | + } |
| 213 | + |
| 214 | +} |
| 215 | +close(S); |
| 216 | +close(W); |
| 217 | + |
| 218 | +print STDERR "Finished." |
0 commit comments