@@ -89,6 +89,12 @@ private class VadState {
8989 // 音频分析
9090 private float avgEnergy = 0 ;
9191 private final List <Float > probs = new ArrayList <>();
92+
93+ // 添加原始VAD概率列表
94+ private final List <Float > originalProbs = new ArrayList <>();
95+
96+ // 帧计数器(用于每10帧输出一次)
97+ private int frameCounter = 0 ;
9298
9399 // 预缓冲
94100 private final LinkedList <byte []> preBuffer = new LinkedList <>();
@@ -166,6 +172,21 @@ public void addProb(float prob) {
166172 updateDeviceType ();
167173 }
168174 }
175+
176+ // 添加原始VAD概率
177+ public void addOriginalProb (float prob ) {
178+ originalProbs .add (prob );
179+ if (originalProbs .size () > 10 ) {
180+ originalProbs .remove (0 );
181+ }
182+
183+ // 增加帧计数器
184+ frameCounter ++;
185+ }
186+
187+ public float getLastOriginalProb () {
188+ return originalProbs .isEmpty () ? 0.0f : originalProbs .get (originalProbs .size () - 1 );
189+ }
169190
170191 public float getLastProb () {
171192 return probs .isEmpty () ? 0.0f : probs .get (probs .size () - 1 );
@@ -174,6 +195,10 @@ public float getLastProb() {
174195 public List <Float > getProbs () {
175196 return probs ;
176197 }
198+
199+ public int getFrameCounter () {
200+ return frameCounter ;
201+ }
177202
178203 public String getDetectedDeviceType () {
179204 return detectedDeviceType ;
@@ -286,6 +311,8 @@ public void reset() {
286311 silenceTime = 0 ;
287312 avgEnergy = 0 ;
288313 probs .clear ();
314+ originalProbs .clear (); // 重置原始概率列表
315+ frameCounter = 0 ; // 重置帧计数器
289316 preBuffer .clear ();
290317 preBufferSize = 0 ;
291318 pcmData .clear ();
@@ -386,17 +413,29 @@ public VadResult processAudio(String sessionId, byte[] opusData) {
386413 return new VadResult (VadStatus .ERROR , null );
387414 }
388415
416+ // 分析原始音频
417+ float [] originalSamples = bytesToFloats (pcmData );
418+ float originalEnergy = calcEnergy (originalSamples );
419+
420+ // 获取原始VAD概率
421+ float originalSpeechProb = detectSpeech (originalSamples );
422+ state .addOriginalProb (originalSpeechProb );
423+
389424 // ========== 应用 AGC ==========
390425 String deviceType = state .getDetectedDeviceType ();
391426 byte [] originalPcm = pcmData .clone (); // 保留原始数据用于对比
392- pcmData = agc .process (sessionId , pcmData , deviceType );
427+
428+ // 检查AGC是否可用
429+ if (agc != null ) {
430+ pcmData = agc .process (sessionId , pcmData , deviceType );
431+ }
393432
394433 // 获取AGC统计信息
395- AutomaticGainControl .AgcStats agcStats = agc .getStats (sessionId );
434+ AutomaticGainControl .AgcStats agcStats = agc != null ? agc .getStats (sessionId ) : new AutomaticGainControl . AgcStats ( );
396435
397436 // 根据AGC增益动态调整VAD阈值
398- speechThreshold = adjustVadThreshold (speechThreshold , agcStats );
399- silenceThreshold = adjustVadThreshold (silenceThreshold , agcStats );
437+ float adjustedSpeechThreshold = adjustVadThreshold (speechThreshold , agcStats );
438+ float adjustedSilenceThreshold = adjustVadThreshold (silenceThreshold , agcStats );
400439
401440 // 添加到预缓冲区
402441 state .addToPreBuffer (pcmData );
@@ -415,21 +454,22 @@ public VadResult processAudio(String sessionId, byte[] opusData) {
415454 if (pcmData .length == 0 ) {
416455 return new VadResult (VadStatus .NO_SPEECH , null );
417456 }
457+
418458 }
419459
420- // 分析音频
460+ // 分析AGC处理后的音频
421461 float [] samples = bytesToFloats (pcmData );
422462 float energy = calcEnergy (samples );
423463 state .updateEnergy (energy );
424464
425- // VAD推断
465+ // VAD推断(AGC后)
426466 float speechProb = detectSpeech (samples );
427467 state .addProb (speechProb );
428468
429469 // 判断语音状态
430470 boolean hasEnergy = energy > state .getAvgEnergy () * 1.5 && energy > energyThreshold ;
431- boolean isSpeech = speechProb > speechThreshold && hasEnergy ;
432- boolean isSilence = speechProb < silenceThreshold ;
471+ boolean isSpeech = speechProb > adjustedSpeechThreshold && hasEnergy ;
472+ boolean isSilence = speechProb < adjustedSilenceThreshold ;
433473 state .updateSilence (isSilence );
434474
435475 // 处理状态转换
@@ -443,9 +483,9 @@ public VadResult processAudio(String sessionId, byte[] opusData) {
443483 agcInfo = String .format (", AGC增益: %.2f, 设备类型: %s" ,
444484 agcStats .gain , state .getDetectedDeviceType ());
445485
446- logger .info ("检测到语音开始 - SessionId: {}, 概率: {}, 能量: {}, " +
486+ logger .info ("检测到语音开始 - SessionId: {}, 概率: {}, 原始概率: {}, 能量: {}, " +
447487 "调整后阈值: {}{}" ,
448- sessionId , speechProb , energy , speechThreshold , agcInfo );
488+ sessionId , speechProb , originalSpeechProb , energy , adjustedSpeechThreshold , agcInfo );
449489
450490 // 获取预缓冲数据
451491 byte [] preBufferData = state .drainPreBuffer ();
@@ -486,7 +526,7 @@ public VadResult processAudio(String sessionId, byte[] opusData) {
486526 return new VadResult (VadStatus .NO_SPEECH , null );
487527 }
488528 } catch (Exception e ) {
489- logger .error ("处理音频失败: {}" , sessionId , e );
529+ logger .error ("处理音频失败: {}, 错误: {} " , sessionId , e . getMessage () , e );
490530 return new VadResult (VadStatus .ERROR , null );
491531 }
492532 }
@@ -632,6 +672,17 @@ public float getSpeechProbability(String sessionId) {
632672 return state != null ? state .getLastProb () : 0.0f ;
633673 }
634674 }
675+
676+ /**
677+ * 获取原始语音概率
678+ */
679+ public float getOriginalSpeechProbability (String sessionId ) {
680+ Object lock = getLock (sessionId );
681+ synchronized (lock ) {
682+ VadState state = states .get (sessionId );
683+ return state != null ? state .getLastOriginalProb () : 0.0f ;
684+ }
685+ }
635686
636687 /**
637688 * 获取音频数据
@@ -687,6 +738,17 @@ public void setDeviceType(String sessionId, String deviceType) {
687738 }
688739 }
689740
741+ /**
742+ * 获取当前帧计数
743+ */
744+ public int getFrameCounter (String sessionId ) {
745+ Object lock = getLock (sessionId );
746+ synchronized (lock ) {
747+ VadState state = states .get (sessionId );
748+ return state != null ? state .getFrameCounter () : 0 ;
749+ }
750+ }
751+
690752 /**
691753 * VAD状态枚举
692754 */
0 commit comments