Skip to content

Added Segment Silence Detection #48

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions pkg/segmenter/opt.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package segmenter

import (
"time"

// Packages
media "github.com/mutablelogic/go-media"
)

///////////////////////////////////////////////////////////////////////////////////
// TYPES

// Opt is a functional option. It is called with the options being built,
// may modify them, and returns an error if the supplied value is rejected.
type Opt func(*opts) error

// opts holds the optional settings which are populated by applyOpts. The
// zero value leaves both fields unset.
type opts struct {
// Frame energy below which a frame is treated as silence. Silence
// detection is only active when this is greater than zero.
SilenceThreshold float64 // Silence threshold
// How long the frame energy must stay below SilenceThreshold before a
// segment boundary is considered.
SilenceDuration time.Duration // Duration of silence to consider a segment boundary
}

///////////////////////////////////////////////////////////////////////////////////
// GLOBALS

// Default values which are applied by WithDefaultSilenceThreshold.
const (
// DefaultSilenceThreshold is the default value for the SilenceThreshold
// option.
DefaultSilenceThreshold = 0.0005 // Default silence threshold
// DefaultSilenceDuration is the default value for the SilenceDuration
// option (two seconds).
DefaultSilenceDuration = time.Second * 2 // Default silence duration
)

///////////////////////////////////////////////////////////////////////////////////
// LIFECYCLE

// applyOpts creates a zero-valued set of options and applies each supplied
// option to it, in the order given.
//
// Nil options are skipped, since calling a nil function value would panic.
// Processing stops at the first option which returns an error; that error is
// returned together with a nil result, so partially-applied options are never
// exposed to the caller.
func applyOpts(opt ...Opt) (*opts, error) {
	o := new(opts)
	for _, fn := range opt {
		// Tolerate nil entries instead of panicking on the call below
		if fn == nil {
			continue
		}
		if err := fn(o); err != nil {
			return nil, err
		}
	}
	return o, nil
}

///////////////////////////////////////////////////////////////////////////////////
// OPTIONS

// WithDefaultSilenceThreshold returns an option which sets the silence
// threshold to DefaultSilenceThreshold.
//
// The silence duration is set to DefaultSilenceDuration only when no duration
// has been set yet, so a value supplied through WithSilenceDuration is kept
// regardless of the order in which the two options are passed. The returned
// option never fails.
func WithDefaultSilenceThreshold() Opt {
	return func(o *opts) error {
		o.SilenceThreshold = DefaultSilenceThreshold

		// Do not clobber a duration which was explicitly set by an earlier
		// option; only fill in the default when the field is still unset
		if o.SilenceDuration == 0 {
			o.SilenceDuration = DefaultSilenceDuration
		}
		return nil
	}
}

// WithSilenceDuration returns an option which sets the duration of silence
// that is considered a segment boundary.
//
// When the option is applied, a duration shorter than 100ms is rejected with
// media.ErrBadParameter and the options are left unchanged. This option does
// not set a silence threshold.
func WithSilenceDuration(v time.Duration) Opt {
	// Shortest duration which is accepted
	const minSilenceDuration = 100 * time.Millisecond

	return func(o *opts) error {
		if v < minSilenceDuration {
			return media.ErrBadParameter.Withf("silence duration %s is too short, must be at least 100ms", v)
		}
		o.SilenceDuration = v
		return nil
	}
}
166 changes: 96 additions & 70 deletions pkg/segmenter/segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"errors"
"io"
"math"
"time"

// Packages
Expand All @@ -17,7 +18,9 @@ import (
// A segmenter reads audio samples from a reader and segments them into
// fixed-size chunks. The segmenter can be used to process audio samples
type Segmenter struct {
opts
ts time.Duration
sts float64 // silence timestamps
sample_rate int
n int
buf_flt []float32
Expand All @@ -33,6 +36,13 @@ type SegmentFuncFloat32 func(time.Duration, []float32) error
// segment of audio samples. The first argument is the timestamp of the segment.
type SegmentFuncInt16 func(time.Duration, []int16) error

//////////////////////////////////////////////////////////////////////////////
// GLOBALS

const (
Int16Gain = float64(math.MaxInt16) // Gain for converting int16 to float32
)

//////////////////////////////////////////////////////////////////////////////
// LIFECYCLE

Expand All @@ -43,9 +53,16 @@ type SegmentFuncInt16 func(time.Duration, []int16) error
//
// At the moment, the audio format is auto-detected, but there should be
// a way to specify the audio format.
func NewReader(r io.Reader, dur time.Duration, sample_rate int) (*Segmenter, error) {
func NewReader(r io.Reader, dur time.Duration, sample_rate int, opts ...Opt) (*Segmenter, error) {
segmenter := new(Segmenter)

// Apply options
if o, err := applyOpts(opts...); err != nil {
return nil, err
} else {
segmenter.opts = *o
}

// Check arguments
if dur < 0 || sample_rate <= 0 {
return nil, media.ErrBadParameter.With("invalid duration or sample rate arguments")
Expand Down Expand Up @@ -108,22 +125,53 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er
}

// Allocate the buffer
if s.n > 0 {
s.buf_flt = make([]float32, 0, s.n)
}
s.buf_flt = make([]float32, 0, s.n)

// Reset the silence timestamp
s.sts = -1

// Decode samples and segment
if err := s.reader.Decode(ctx, mapFunc, func(stream int, frame *ffmpeg.Frame) error {
// We get null frames sometimes, ignore them
// Ignore null frames
if frame == nil {
return nil
}

// Return if the frame is empty
data := frame.Float32(0)
if len(data) == 0 {
return nil
}

// Calculate the energy of the frame - root mean squared and normalize between 0 and 1
var sum float32
var energy float64
for _, sample := range data {
sum += float32(sample) * float32(sample)
}
energy = math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16)

// If silence detection is enabled, check if the energy is below the threshold
var cut bool
if s.SilenceThreshold > 0 && energy < s.SilenceThreshold {
// If the energy is below the threshold, we consider it silence
if s.sts == -1 {
// If this is the first silence, set the timestamp
s.sts = frame.Ts()
} else if frame.Ts()-s.sts >= s.SilenceDuration.Seconds() {
// Cut when the buffer size is greater than 10 seconds
if len(s.buf_flt) >= s.sample_rate*10 {
cut = true
}
s.sts = -1 // Reset the silence timestamp
}
}

// Append float32 samples from plane 0 to buffer
s.buf_flt = append(s.buf_flt, frame.Float32(0)...)

// n != 0 and len(buf) >= n we have a segment to process
if s.n != 0 && len(s.buf_flt) >= s.n {
if (s.n != 0 && len(s.buf_flt) >= s.n) || cut {
if err := s.segment_flt(fn); err != nil {
return err
}
Expand Down Expand Up @@ -173,25 +221,57 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
}

// Allocate the buffer
if s.n > 0 {
s.buf_s16 = make([]int16, 0, s.n)
}
s.buf_s16 = make([]int16, 0, s.n)

// Reset the silence timestamp
s.sts = -1

// Decode samples and segment
if err := s.reader.Decode(ctx, mapFunc, func(stream int, frame *ffmpeg.Frame) error {
// We get null frames sometimes, ignore them
// Ignore null frames
if frame == nil {
return nil
}

// Return if the frame is empty
data := frame.Int16(0)
if len(data) == 0 {
return nil
}

// Calculate the energy of the frame - root mean squared and normalize between 0 and 1
var sum float32
var energy float64
for _, sample := range data {
sum += float32(sample) * float32(sample)
}
energy = math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16)

// If silence detection is enabled, check if the energy is below the threshold
var cut bool
if s.SilenceThreshold > 0 && energy < s.SilenceThreshold {
// If the energy is below the threshold, we consider it silence
if s.sts == -1 {
// If this is the first silence, set the timestamp
s.sts = frame.Ts()
} else if frame.Ts()-s.sts >= s.SilenceDuration.Seconds() {
// Cut when the buffer size is greater than 10 seconds
if len(s.buf_s16) >= s.sample_rate*10 {
cut = true
}
s.sts = -1 // Reset the silence timestamp
}
}

// Append int16 samples from plane 0 to buffer
s.buf_s16 = append(s.buf_s16, frame.Int16(0)...)
s.buf_s16 = append(s.buf_s16, data...)

// n != 0 and len(buf) >= n we have a segment to process
if s.n != 0 && len(s.buf_s16) >= s.n {
if (s.n != 0 && len(s.buf_s16) >= s.n) || cut {
if err := s.segment_s16(fn); err != nil {
return err
}

// Increment the timestamp
s.ts += time.Duration(len(s.buf_s16)) * time.Second / time.Duration(s.sample_rate)

Expand Down Expand Up @@ -223,65 +303,11 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
// PRIVATE METHODS

func (s *Segmenter) segment_flt(fn SegmentFuncFloat32) error {
// Not segmenting
if s.n == 0 {
return fn(s.ts, s.buf_flt)
}

// Split into n-sized segments
bufLength := len(s.buf_flt)
ts := s.ts
tsinc := time.Duration(s.n) * time.Second / time.Duration(s.sample_rate)
for i := 0; i < bufLength; i += s.n {
end := i + s.n
var segment []float32
if end <= bufLength {
// If the segment fits exactly or there are enough items
segment = s.buf_flt[i:end]
} else {
// If the segment is smaller than segmentSize, pad with zeros
segment = make([]float32, s.n)
copy(segment, s.buf_flt[i:bufLength])
}
if err := fn(ts, segment); err != nil {
return err
} else {
ts += tsinc
}
}

// Return success
return nil
// TODO: Pad any remaining samples with zeros if the buffer is not full
return fn(s.ts, s.buf_flt)
}

func (s *Segmenter) segment_s16(fn SegmentFuncInt16) error {
// Not segmenting
if s.n == 0 {
return fn(s.ts, s.buf_s16)
}

// Split into n-sized segments
bufLength := len(s.buf_s16)
ts := s.ts
tsinc := time.Duration(s.n) * time.Second / time.Duration(s.sample_rate)
for i := 0; i < bufLength; i += s.n {
end := i + s.n
var segment []int16
if end <= bufLength {
// If the segment fits exactly or there are enough items
segment = s.buf_s16[i:end]
} else {
// If the segment is smaller than segmentSize, pad with zeros
segment = make([]int16, s.n)
copy(segment, s.buf_s16[i:bufLength])
}
if err := fn(ts, segment); err != nil {
return err
} else {
ts += tsinc
}
}

// Return success
return nil
// TODO: Pad any remaining samples with zeros if the buffer is not full
return fn(s.ts, s.buf_s16)
}
Loading