diff --git a/ClassTranscribeDatabase/global.json b/ClassTranscribeDatabase/global.json index 4100a4a8..215288b9 100644 --- a/ClassTranscribeDatabase/global.json +++ b/ClassTranscribeDatabase/global.json @@ -1,5 +1,5 @@ { "sdk": { - "version": "8.0.201" + "version": "8.0" } } \ No newline at end of file diff --git a/ClassTranscribeServer/global.json b/ClassTranscribeServer/global.json index a679dd12..215288b9 100644 --- a/ClassTranscribeServer/global.json +++ b/ClassTranscribeServer/global.json @@ -1,5 +1,5 @@ { "sdk": { - "version": "8.0.401" + "version": "8.0" } } \ No newline at end of file diff --git a/PythonRpcServer/randomvoice_16kHz.json b/PythonRpcServer/randomvoice_16kHz.json new file mode 100644 index 00000000..c3053a9b --- /dev/null +++ b/PythonRpcServer/randomvoice_16kHz.json @@ -0,0 +1 @@ +{"text": " Hello? Hello? Hello?", "segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 3.0, "text": " Hello? Hello? Hello?", "tokens": [50363, 18435, 30, 18435, 30, 18435, 30, 50513], "temperature": 0.0, "avg_logprob": -0.636968559688992, "compression_ratio": 1.1764705882352942, "no_speech_prob": 0.22877301275730133}], "language": "en"} \ No newline at end of file diff --git a/PythonRpcServer/randomvoice_16kHz.wav b/PythonRpcServer/randomvoice_16kHz.wav new file mode 100644 index 00000000..fd2a335f Binary files /dev/null and b/PythonRpcServer/randomvoice_16kHz.wav differ diff --git a/PythonRpcServer/server.py b/PythonRpcServer/server.py index 943c2c5c..09d20385 100644 --- a/PythonRpcServer/server.py +++ b/PythonRpcServer/server.py @@ -41,6 +41,18 @@ def LogWorker(logId, worker): class PythonServerServicer(ct_pb2_grpc.PythonServerServicer): + # Transcribe it into a json string from the transcribe text + # Make it returns a json string + # change name to TranscribeRPC + def CaptionRPC(self, request, context): + #See CaptionRequest + print( f"CaptionRPC({request.logId};{request.refId};{request.filePath};{request.phraseHints};{request.courseHints};{request.outputLanguages})") + kalturaprovider = KalturaProvider() + result = LogWorker(f"CaptionRPC({request.filePath})", lambda: kalturaprovider.getCaptions(request.refId)) + return ct_pb2.JsonString(json = result) + + + def GetScenesRPC(self, request, context): raise NotImplementedError('Implementation now in pyapi') # res = scenedetector.find_scenes(request.filePath) diff --git a/PythonRpcServer/transcribe.py b/PythonRpcServer/transcribe.py new file mode 100644 index 00000000..3a024d3f --- /dev/null +++ b/PythonRpcServer/transcribe.py @@ -0,0 +1,57 @@ +import subprocess +import os +import json +import re + +def transcribe_audio_with_whisper(audio_file_path): + if not os.path.exists(audio_file_path): + raise FileNotFoundError(f"Audio file {audio_file_path} does not exist.") + + command = [ + "whisper", + audio_file_path, + "--model", "base.en", + "--output_format", "json" + ] + + try: + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True) + + print("Whisper Output:") + print(result.stdout) + + formatted_data = {"en": []} + + segments = result.stdout.strip().split('\n\n') + for segment in segments: + match = re.search(r'\[(\d+:\d+\.\d+)\s+-->\s+(\d+:\d+\.\d+)\]\s+(.*)', segment) + if match: + start_time = match.group(1) + end_time = match.group(2) + text = match.group(3).strip() + + formatted_data["en"].append({ + "starttime": start_time, + "endtime": end_time, + "caption": text + }) + + return formatted_data + + except subprocess.CalledProcessError as e: + print(f"Error during transcription: {e.stderr}") + return None + + except Exception as e: + print(f"An unexpected error occurred: {e}") + return None + +if __name__ == "__main__": + audio_file = "randomvoice_16kHz.wav" + + transcription = transcribe_audio_with_whisper(audio_file) + + if transcription: + print(json.dumps(transcription, indent=4)) + else: + print("Transcription failed.") \ No newline at end of file diff --git a/TaskEngine.Dockerfile b/TaskEngine.Dockerfile index 6ff8d946..a99244c9 100644 --- a/TaskEngine.Dockerfile +++ b/TaskEngine.Dockerfile @@ -1,6 +1,8 @@ FROM mcr.microsoft.com/dotnet/sdk:8.0-bookworm-slim as build # See https://mcr.microsoft.com/en-us/product/dotnet/sdk/tags #See more comments in API.Dockerfile +# RUN ls +RUN dotnet --list-sdks WORKDIR / RUN git clone https://github.com/eficode/wait-for.git @@ -8,6 +10,8 @@ RUN git clone https://github.com/eficode/wait-for.git WORKDIR /src COPY ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj # --verbosity normal|diagnostic + + RUN dotnet restore --verbosity diagnostic ./ClassTranscribeDatabase/ClassTranscribeDatabase.csproj COPY ./TaskEngine/TaskEngine.csproj ./TaskEngine/TaskEngine.csproj diff --git a/TaskEngine/Program.cs b/TaskEngine/Program.cs index a8e1c405..ebe513e5 100644 --- a/TaskEngine/Program.cs +++ b/TaskEngine/Program.cs @@ -81,7 +81,8 @@ public static void SetupServices() .AddSingleton() .AddSingleton() .AddSingleton() - .AddSingleton() + .AddSingleton() + .AddSingleton() .AddSingleton() // .AddSingleton() .AddSingleton() @@ -175,7 +176,7 @@ static void createTaskQueues() { // Transcription Related _logger.LogInformation($"Creating TranscriptionTask consumers. Concurrency={concurrent_transcriptions} "); - _serviceProvider.GetService().Consume(concurrent_transcriptions); + _serviceProvider.GetService().Consume(concurrent_transcriptions); // no more! - _serviceProvider.GetService().Consume(concurrent_transcriptions); diff --git a/TaskEngine/Tasks/TranscriptionTask.cs b/TaskEngine/Tasks/AzureTranscriptionTask.cs similarity index 98% rename from TaskEngine/Tasks/TranscriptionTask.cs rename to TaskEngine/Tasks/AzureTranscriptionTask.cs index 217fa633..6bf250ad 100644 --- a/TaskEngine/Tasks/TranscriptionTask.cs +++ b/TaskEngine/Tasks/AzureTranscriptionTask.cs @@ -21,7 +21,7 @@ namespace TaskEngine.Tasks /// This task produces the transcriptions for a Video item. /// [SuppressMessage("Microsoft.Performance", "CA1812:MarkMembersAsStatic")] // This class is never directly instantiated - class TranscriptionTask : RabbitMQTask + class AzureTranscriptionTask : RabbitMQTask { private readonly MSTranscriptionService _msTranscriptionService; @@ -29,9 +29,9 @@ class TranscriptionTask : RabbitMQTask private readonly CaptionQueries _captionQueries; - public TranscriptionTask(RabbitMQConnection rabbitMQ, MSTranscriptionService msTranscriptionService, + public AzureTranscriptionTask(RabbitMQConnection rabbitMQ, MSTranscriptionService msTranscriptionService, // GenerateVTTFileTask generateVTTFileTask, - ILogger logger, CaptionQueries captionQueries) + ILogger logger, CaptionQueries captionQueries) : base(rabbitMQ, TaskType.TranscribeVideo, logger) { _msTranscriptionService = msTranscriptionService; diff --git a/TaskEngine/Tasks/ConvertVideoToWavTask.cs b/TaskEngine/Tasks/ConvertVideoToWavTask.cs index a8e2f363..5ec7475c 100644 --- a/TaskEngine/Tasks/ConvertVideoToWavTask.cs +++ b/TaskEngine/Tasks/ConvertVideoToWavTask.cs @@ -21,13 +21,13 @@ namespace TaskEngine.Tasks class ConvertVideoToWavTask : RabbitMQTask { private readonly RpcClient _rpcClient; - private readonly TranscriptionTask _transcriptionTask; + private readonly LocalTranscriptionTask _localTranscriptionTask; - public ConvertVideoToWavTask(RabbitMQConnection rabbitMQ, RpcClient rpcClient, TranscriptionTask transcriptionTask, ILogger logger) + public ConvertVideoToWavTask(RabbitMQConnection rabbitMQ, RpcClient rpcClient, LocalTranscriptionTask localTranscriptionTask, ILogger logger) : base(rabbitMQ, TaskType.ConvertMedia, logger) { _rpcClient = rpcClient; - _transcriptionTask = transcriptionTask; + _localTranscriptionTask = localTranscriptionTask; } protected override Task OnConsume(string videoId, TaskParameters taskParameters, ClientActiveTasks cleanup) @@ -72,11 +72,10 @@ private async Task OldOnConsumeNotUsed(string videoId) videoLatest.Audio = fileRecord; await _context.SaveChangesAsync(); - // If no transcriptions present, produce transcriptions. if (!videoLatest.Transcriptions.Any()) { - _transcriptionTask.Publish(videoLatest.Id); + _localTranscriptionTask.Publish(videoLatest.Id); } } } diff --git a/TaskEngine/Tasks/LocalTranscriptionTask.cs b/TaskEngine/Tasks/LocalTranscriptionTask.cs new file mode 100644 index 00000000..0352ce3c --- /dev/null +++ b/TaskEngine/Tasks/LocalTranscriptionTask.cs @@ -0,0 +1,187 @@ +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.Logging; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Threading.Tasks; +using Grpc.Core; +using Newtonsoft.Json.Linq; + + +using ClassTranscribeDatabase; +using ClassTranscribeDatabase.Models; +using ClassTranscribeDatabase.Services; + +using static ClassTranscribeDatabase.CommonUtils; + +#pragma warning disable CA2007 +// https://learn.microsoft.com/en-us/dotnet/fundamentals/code-analysis/quality-rules/ca2007 +// We are okay awaiting on a task in the same thread + +namespace TaskEngine.Tasks +{ + /// + /// This task produces the transcriptions for a Video item. + /// + [SuppressMessage("Microsoft.Performance", "CA1812:MarkMembersAsStatic")] // This class is never directly instantiated + class LocalTranscriptionTask : RabbitMQTask + { + + private readonly CaptionQueries _captionQueries; + private readonly RpcClient _rpcClient; + + + public LocalTranscriptionTask(RabbitMQConnection rabbitMQ, + RpcClient rpcClient, + // GenerateVTTFileTask generateVTTFileTask, + ILogger logger, CaptionQueries captionQueries) + : base(rabbitMQ, TaskType.TranscribeVideo, logger) + { + _rpcClient = rpcClient; + _captionQueries = captionQueries; + } + + protected async override Task OnConsume(string videoId, TaskParameters taskParameters, ClientActiveTasks cleanup) + { + RegisterTask(cleanup, videoId); // may throw AlreadyInProgress exception + + const string SOURCEINTERNALREF= "ClassTranscribe/Local"; // Do not change me; this is a key inside the database + // to indicate the source of the captions was this code + + + using (var _context = CTDbContext.CreateDbContext()) + { + + // TODO: taskParameters.Force should wipe all captions and reset the Transcription Status + + Video video = await _context.Videos.Include(v => v.Video1).Where(v => v.Id == videoId).FirstAsync(); + // ! Note the 'Include' ; we don't build the whole tree of related Entities + + if (video.TranscriptionStatus == Video.TranscriptionStatusMessages.NOERROR) + { + GetLogger().LogInformation($"{videoId}:Skipping Transcribing of- already complete"); + return; + } + var medias = await _context.Medias.Include(m=>m.Playlist).Where(m=>m.VideoId == videoId && m.Playlist != null).ToListAsync(); + if(medias.Count == 0) { + GetLogger().LogInformation($"{videoId}:Skipping Transcribing - no media / playlist cares about this video"); + return; + } + + GetLogger().LogInformation($"{videoId}: Has new Phrase Hints: {video.HasPhraseHints()}"); + + string phraseHints = ""; + if (video.HasPhraseHints()) { + var data = await _context.TextData.FindAsync(video.PhraseHintsDataId); + phraseHints = data.Text; + } else + { // deprecated + phraseHints = video.PhraseHints ?? ""; + } + + GetLogger().LogInformation($"{videoId}:Using Phrase Hints length = {phraseHints.Length}"); + // GetKey can throw if the video.Id is currently being transcribed + // However registerTask should have already detected that + var key = TaskEngineGlobals.KeyProvider.GetKey(video.Id); + + video.TranscribingAttempts += 10; + await _context.SaveChangesAsync(); + GetLogger().LogInformation($"{videoId}: Updated TranscribingAttempts = {video.TranscribingAttempts}"); + try + { + + GetLogger().LogInformation($"{videoId}: Calling RecognitionWithVideoStreamAsync"); + + var request = new CTGrpc.TranscriptionRequest + { + LogId = videoId, + FilePath = video.Video1.VMPath, + Model = "en", + Language = "en" + // PhraseHints = phraseHints, + // CourseHints = "", + // OutputLanguages = "en" + }; + var jsonString = ""; + try { + jsonString = (await _rpcClient.PythonServerClient.TranscribeAudioRPCAsync(request)).Json; + } + catch (RpcException e) + { + if (e.Status.StatusCode == StatusCode.InvalidArgument) + { + GetLogger().LogError($"TranscribeAudioRPCAsync=({videoId}):{e.Message}"); + } + return; + } finally { + GetLogger().LogInformation($"{videoId} Transcribe - rpc complete"); + TaskEngineGlobals.KeyProvider.ReleaseKey(key, video.Id); + } + + JObject jObject = JObject.Parse(jsonString); + // JArray jArray = JArray.Parse(jsonString); + var theLanguage = jObject["result"]["language"].ToString(Newtonsoft.Json.Formatting.None); + var theCaptionsAsJson = jObject["transcription"]; + + var theCaptions = new List(); + int cueCount = 0; + + foreach (var jsonCue in theCaptionsAsJson) { + var caption = new Caption() { + Index = cueCount ++, + Begin = TimeSpan.Parse(jsonCue["timestamps"]["from"].ToString(Newtonsoft.Json.Formatting.None)), + End = TimeSpan.Parse(jsonCue["timestamps"]["to"].ToString(Newtonsoft.Json.Formatting.None)) , + Text = jsonCue["text"] .ToString(Newtonsoft.Json.Formatting.None) + }; + + theCaptions.Add(caption); + } + if (theCaptions.Count > 0) + { + GetLogger().LogInformation($"{videoId}: Created {theCaptions.Count} captions objects"); + + var t = _context.Transcriptions.SingleOrDefault(t => t.VideoId == video.Id && t.SourceInternalRef == SOURCEINTERNALREF && t.Language == theLanguage && t.TranscriptionType == TranscriptionType.Caption); + GetLogger().LogInformation($"Find Existing Transcriptions null={t == null}"); + // Did we get the default or an existing Transcription entity? + if (t == null) + { + t = new Transcription() + { + TranscriptionType = TranscriptionType.Caption, + Captions = theCaptions, + Language = theLanguage, + VideoId = video.Id, + Label = $"{theLanguage} (ClassTranscribe)", + SourceInternalRef = SOURCEINTERNALREF, // + SourceLabel = "ClassTranscribe (Local" + (phraseHints.Length>0 ?" with phrase hints)" : ")") + // Todo store the entire Whisper result here + }; + _context.Add(t); + } + else + { + t.Captions.AddRange(theCaptions); + } + } + + + video.TranscriptionStatus = "NoError"; + // video.JsonMetadata["LastSuccessfulTime"] = result.LastSuccessTime.ToString(); + + GetLogger().LogInformation($"{videoId}: Saving captions"); + await _context.SaveChangesAsync(); + } + catch (Exception ex) + { + GetLogger().LogError(ex, $"{videoId}: Transcription Exception:${ex.StackTrace}"); + video.TranscribingAttempts += 1000; + await _context.SaveChangesAsync(); + throw; + } + + } + } + + } +} \ No newline at end of file diff --git a/TaskEngine/Tasks/QueueAwakerTask.cs b/TaskEngine/Tasks/QueueAwakerTask.cs index ed1d7225..3c4b0d2d 100644 --- a/TaskEngine/Tasks/QueueAwakerTask.cs +++ b/TaskEngine/Tasks/QueueAwakerTask.cs @@ -22,7 +22,7 @@ class QueueAwakerTask : RabbitMQTask private readonly DownloadPlaylistInfoTask _downloadPlaylistInfoTask; private readonly DownloadMediaTask _downloadMediaTask; // private readonly ConvertVideoToWavTask _convertVideoToWavTask; - private readonly TranscriptionTask _transcriptionTask; + private readonly LocalTranscriptionTask _transcriptionTask; // nope private readonly GenerateVTTFileTask _generateVTTFileTask; private readonly ProcessVideoTask _processVideoTask; private readonly SceneDetectionTask _sceneDetectionTask; @@ -39,7 +39,7 @@ public QueueAwakerTask() { } public QueueAwakerTask(RabbitMQConnection rabbitMQ, DownloadPlaylistInfoTask downloadPlaylistInfoTask, DownloadMediaTask downloadMediaTask, - TranscriptionTask transcriptionTask, ProcessVideoTask processVideoTask, + LocalTranscriptionTask transcriptionTask, ProcessVideoTask processVideoTask, // GenerateVTTFileTask generateVTTFileTask, SceneDetectionTask sceneDetectionTask, CreateBoxTokenTask createBoxTokenTask,// UpdateBoxTokenTask updateBoxTokenTask, diff --git a/TaskEngine/Tasks/SceneDetectionTask.cs b/TaskEngine/Tasks/SceneDetectionTask.cs index 1baf80c0..711bc92c 100644 --- a/TaskEngine/Tasks/SceneDetectionTask.cs +++ b/TaskEngine/Tasks/SceneDetectionTask.cs @@ -19,13 +19,13 @@ namespace TaskEngine.Tasks class SceneDetectionTask : RabbitMQTask { private readonly RpcClient _rpcClient; - private readonly TranscriptionTask _transcriptionTask; + private readonly LocalTranscriptionTask _transcriptionTask; - public SceneDetectionTask(RabbitMQConnection rabbitMQ,TranscriptionTask transcriptionTask, RpcClient rpcClient, ILogger logger) + public SceneDetectionTask(RabbitMQConnection rabbitMQ,LocalTranscriptionTask localTanscriptionTask, RpcClient rpcClient, ILogger logger) : base(rabbitMQ, TaskType.SceneDetection, logger) { _rpcClient = rpcClient; - _transcriptionTask = transcriptionTask; + _transcriptionTask = localTanscriptionTask; } /// Extracts scene information for a video. /// Beware: It is possible to start another scene task while the first one is still running diff --git a/TaskEngine/TempCode.cs b/TaskEngine/TempCode.cs index 896d2f72..34af0142 100644 --- a/TaskEngine/TempCode.cs +++ b/TaskEngine/TempCode.cs @@ -24,7 +24,7 @@ class TempCode private readonly PythonCrawlerTask _pythonCrawlerTask; private readonly ProcessVideoTask _processVideoTask; // private readonly GenerateVTTFileTask _generateVTTFileTask; - private readonly TranscriptionTask _transcriptionTask; + private readonly LocalTranscriptionTask _transcriptionTask; private readonly ConvertVideoToWavTask _convertVideoToWavTask; private readonly DownloadMediaTask _downloadMediaTask; private readonly DownloadPlaylistInfoTask _downloadPlaylistInfoTask; @@ -34,7 +34,7 @@ class TempCode public TempCode(CTDbContext c, CreateBoxTokenTask createBoxTokenTask, //UpdateBoxTokenTask updateBoxTokenTask, SceneDetectionTask ePubGeneratorTask, ProcessVideoTask processVideoTask, - TranscriptionTask transcriptionTask, ConvertVideoToWavTask convertVideoToWavTask, DownloadMediaTask downloadMediaTask, + LocalTranscriptionTask localTranscriptionTask, ConvertVideoToWavTask convertVideoToWavTask, DownloadMediaTask downloadMediaTask, DownloadPlaylistInfoTask downloadPlaylistInfoTask, QueueAwakerTask queueAwakerTask, CleanUpElasticIndexTask cleanUpElasticIndexTask, RpcClient rpcClient, PythonCrawlerTask pythonCrawlerTask) @@ -45,7 +45,7 @@ public TempCode(CTDbContext c, CreateBoxTokenTask createBoxTokenTask, //UpdateBo _sceneDetectionTask = ePubGeneratorTask; _processVideoTask = processVideoTask; // _generateVTTFileTask = generateVTTFileTask; - _transcriptionTask = transcriptionTask; + _transcriptionTask = localTranscriptionTask; _convertVideoToWavTask = convertVideoToWavTask; _downloadMediaTask = downloadMediaTask; _downloadPlaylistInfoTask = downloadPlaylistInfoTask; diff --git a/TaskEngine/global.json b/TaskEngine/global.json index a679dd12..215288b9 100644 --- a/TaskEngine/global.json +++ b/TaskEngine/global.json @@ -1,5 +1,5 @@ { "sdk": { - "version": "8.0.401" + "version": "8.0" } } \ No newline at end of file diff --git a/ct.proto b/ct.proto index 512975ec..e16853c6 100644 --- a/ct.proto +++ b/ct.proto @@ -20,6 +20,15 @@ service PythonServer { rpc ComputeFileHash (FileHashRequest) returns (FileHashResponse) {} rpc GetMediaInfoRPC(File) returns (JsonString) {} + + rpc TranscribeAudioRPC (TranscriptionRequest) returns (JsonString) {} +} + +message TranscriptionRequest { + string filePath = 1; // Path to the audio/video file to be transcribed + string model = 2; // Whisper model to use (e.g., 'base-en', 'tiny-en') + string language = 3; // Language in audio. + string logId = 4; } @@ -31,7 +40,6 @@ message JsonString { // The response message containing the greetings. message PlaylistRequest { string Url = 1; - int32 stream = 2; JsonString metadata = 3; } diff --git a/pythonrpcserver.Dockerfile b/pythonrpcserver.Dockerfile index 443ccc6d..8824aee8 100644 --- a/pythonrpcserver.Dockerfile +++ b/pythonrpcserver.Dockerfile @@ -9,9 +9,9 @@ RUN git clone https://github.com/ggerganov/whisper.cpp . && make RUN bash ./models/download-ggml-model.sh base.en - # ------------------------------ - # Stage 2: Setup Python RPC Server - # ------------------------------ +# ------------------------------ +# Stage 2: Setup Python RPC Server +# ------------------------------ FROM --platform=linux/amd64 python:3.8.15-slim-buster AS rpcserver RUN apt-get update && \ apt-get install -y curl gcc g++ make libglib2.0-0 libsm6 libxext6 libxrender-dev ffmpeg diff --git a/randomvoice_16kHz.json b/randomvoice_16kHz.json new file mode 100644 index 00000000..c3053a9b --- /dev/null +++ b/randomvoice_16kHz.json @@ -0,0 +1 @@ +{"text": " Hello? Hello? Hello?", "segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 3.0, "text": " Hello? Hello? Hello?", "tokens": [50363, 18435, 30, 18435, 30, 18435, 30, 50513], "temperature": 0.0, "avg_logprob": -0.636968559688992, "compression_ratio": 1.1764705882352942, "no_speech_prob": 0.22877301275730133}], "language": "en"} \ No newline at end of file diff --git a/whisper.cpp b/whisper.cpp new file mode 160000 index 00000000..5236f027 --- /dev/null +++ b/whisper.cpp @@ -0,0 +1 @@ +Subproject commit 5236f0278420ab776d1787c4330678d80219b4b6