diff options
| author | Malte Voos <git@mal.tc> | 2025-12-05 15:35:38 +0100 |
|---|---|---|
| committer | Malte Voos <git@mal.tc> | 2025-12-05 15:43:58 +0100 |
| commit | c347b6133365dcf1b7da4e77890b20d04d6cfba4 (patch) | |
| tree | c83aac6f7d1e6edc57e607f01e5d3eeee8da4a0e /src/subtitles/extraction | |
| parent | 652b1c2a0ce7db4885ebc51f7f09133a43401442 (diff) | |
| download | lleap-c347b6133365dcf1b7da4e77890b20d04d6cfba4.tar.gz lleap-c347b6133365dcf1b7da4e77890b20d04d6cfba4.zip | |
Diffstat (limited to 'src/subtitles/extraction')
| -rw-r--r-- | src/subtitles/extraction/embedded.rs | 116 | ||||
| -rw-r--r-- | src/subtitles/extraction/mod.rs | 153 | ||||
| -rw-r--r-- | src/subtitles/extraction/whisper.rs | 139 |
3 files changed, 408 insertions, 0 deletions
diff --git a/src/subtitles/extraction/embedded.rs b/src/subtitles/extraction/embedded.rs new file mode 100644 index 0000000..920f52b --- /dev/null +++ b/src/subtitles/extraction/embedded.rs @@ -0,0 +1,116 @@ +use std::sync::mpsc; + +use anyhow::Context; + +use crate::{subtitles::SubtitleCue, subtitles::extraction::*}; + +pub fn extract_embedded_subtitles( + // stream index to use when storing extracted subtitles, this index already + // has to be in TRACKS when this function is called! + stream_ix: StreamIndex, + context: ffmpeg::codec::Context, + time_base: ffmpeg::Rational, + packet_rx: mpsc::Receiver<ffmpeg::Packet>, + sender: ComponentSender<SubtitleExtractor>, +) -> anyhow::Result<()> { + let mut decoder = context + .decoder() + .subtitle() + .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?; + + while let Ok(packet) = packet_rx.recv() { + let mut subtitle = ffmpeg::Subtitle::new(); + match decoder.decode(&packet, &mut subtitle) { + Ok(true) => { + if let Some(cue) = parse_subtitle(&subtitle, &packet, time_base) { + sender + .output(SubtitleExtractorOutput::NewCue(stream_ix, cue)) + .unwrap(); + } else { + log::error!("error parsing subtitle at pts {:?}", packet.pts()) + } + } + Ok(false) => { + log::debug!("got empty (?) subtitle, not sure if this should ever happen"); + } + Err(e) => { + log::error!("error decoding subtitle: {:?}", e) + } + } + } + + Ok(()) +} + +fn parse_subtitle( + subtitle: &ffmpeg::Subtitle, + packet: &ffmpeg::Packet, + time_base: Rational, +) -> Option<SubtitleCue> { + let pts_to_clock_time = |pts: i64| { + let nseconds: i64 = + (pts * time_base.numerator() as i64 * 1_000_000_000) / time_base.denominator() as i64; + gst::ClockTime::from_nseconds(nseconds as u64) + }; + + let text = subtitle + .rects() + .into_iter() + .map(|rect| match rect { + ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(), + ffmpeg::subtitle::Rect::Ass(ass) => { + extract_dialogue_text(ass.get()).unwrap_or(String::new()) + } + _ => String::new(), + }) + .collect::<Vec<String>>() + .join("\n— "); + + let start_time = pts_to_clock_time(packet.pts()?); + let end_time = pts_to_clock_time(packet.pts()? + packet.duration()); + + Some(SubtitleCue { + text, + start_time, + end_time, + }) +} + +fn extract_dialogue_text(dialogue_line: &str) -> Option<String> { + // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text + // we need the 9th field (Text), so split on comma but only take first 9 splits + // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433 + let text = dialogue_line.splitn(9, ',').last()?; + + // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc. + let mut result = String::new(); + let mut in_tag = false; + let mut char_iter = text.chars().peekable(); + + while let Some(c) = char_iter.next() { + if c == '{' && char_iter.peek() == Some(&'\\') { + in_tag = true; + } else if c == '}' { + in_tag = false; + } else if !in_tag { + // process line breaks and hard spaces + if c == '\\' { + match char_iter.peek() { + Some(&'N') => { + char_iter.next(); + result.push('\n'); + } + Some(&'n') | Some(&'h') => { + char_iter.next(); + result.push(' '); + } + _ => result.push(c), + } + } else { + result.push(c); + } + } + } + + Some(result) +} diff --git a/src/subtitles/extraction/mod.rs b/src/subtitles/extraction/mod.rs new file mode 100644 index 0000000..b012658 --- /dev/null +++ b/src/subtitles/extraction/mod.rs @@ -0,0 +1,153 @@ +/// Extraction of embedded subtitles +mod embedded; +/// Synthesis of subtitles from audio using whisper.cpp +mod whisper; + +use std::{collections::BTreeMap, sync::mpsc, thread}; + +use ffmpeg::Rational; +use relm4::{ComponentSender, Worker}; + +use crate::subtitles::{SUBTITLE_TRACKS, StreamIndex, SubtitleCue, SubtitleTrack, TrackMetadata}; + +pub struct SubtitleExtractor {} + +#[derive(Debug)] +pub enum SubtitleExtractorMsg { + ExtractFromUrl { + url: String, + // the index of the audio stream on which to run a whisper transcription + whisper_stream_index: Option<usize>, + }, +} + +#[derive(Debug)] +pub enum SubtitleExtractorOutput { + NewCue(StreamIndex, SubtitleCue), + ExtractionComplete, +} + +impl Worker for SubtitleExtractor { + type Init = (); + type Input = SubtitleExtractorMsg; + type Output = SubtitleExtractorOutput; + + fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self { + Self {} + } + + fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) { + match msg { + SubtitleExtractorMsg::ExtractFromUrl { + url, + whisper_stream_index: whisper_audio_stream_ix, + } => { + self.handle_extract_from_url(url, whisper_audio_stream_ix, sender); + } + } + } +} + +impl SubtitleExtractor { + fn handle_extract_from_url( + &mut self, + url: String, + whisper_audio_stream_ix: Option<usize>, + sender: ComponentSender<Self>, + ) { + // Clear existing tracks + SUBTITLE_TRACKS.write().clear(); + + match self.extract_subtitles(&url, whisper_audio_stream_ix, sender.clone()) { + Ok(_) => { + log::info!("Subtitle extraction completed successfully"); + sender + .output(SubtitleExtractorOutput::ExtractionComplete) + .unwrap(); + } + Err(e) => { + log::error!("Subtitle extraction failed: {}", e); + } + } + } + + fn extract_subtitles( + &self, + url: &str, + whisper_audio_stream_ix: Option<usize>, + sender: ComponentSender<Self>, + ) -> anyhow::Result<()> { + let mut input = ffmpeg::format::input(&url)?; + + let mut subtitle_extractors = BTreeMap::new(); + + // create extractor for each subtitle stream + for stream in input.streams() { + let stream_ix = stream.index(); + + if stream.parameters().medium() == ffmpeg::media::Type::Subtitle { + let metadata = TrackMetadata::from_ffmpeg_stream(&stream); + let track = SubtitleTrack::new(metadata); + + SUBTITLE_TRACKS.write().insert(stream_ix, track); + + let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?; + let (packet_tx, packet_rx) = mpsc::channel(); + let time_base = stream.time_base(); + let sender = sender.clone(); + let join_handle = thread::spawn(move || { + embedded::extract_embedded_subtitles( + stream_ix, context, time_base, packet_rx, sender, + ) + }); + + subtitle_extractors.insert(stream_ix, (packet_tx, join_handle)); + } + } + + if let Some(stream_ix) = whisper_audio_stream_ix { + let stream = input.stream(stream_ix).unwrap(); + + let mut metadata = TrackMetadata::from_ffmpeg_stream(&stream); + metadata.title = Some(match metadata.title { + Some(title) => format!("Auto-generated from audio (Whisper): {}", title), + None => "Auto-generated from audio (Whisper)".to_string(), + }); + + let track = SubtitleTrack::new(metadata); + + SUBTITLE_TRACKS.write().insert(stream_ix, track); + + let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?; + let (packet_tx, packet_rx) = mpsc::channel(); + let time_base = stream.time_base(); + let sender = sender.clone(); + let join_handle = thread::spawn(move || { + whisper::generate_whisper_subtitles( + stream_ix, context, time_base, packet_rx, sender, + ) + }); + + subtitle_extractors.insert(stream_ix, (packet_tx, join_handle)); + } + + // process packets + for (stream, packet) in input.packets() { + let stream_index = stream.index(); + + if let Some((packet_tx, _)) = subtitle_extractors.get_mut(&stream_index) { + packet_tx.send(packet).unwrap(); + } + } + + // wait for extraction to complete + for (_, (_, join_handle)) in subtitle_extractors { + join_handle + .join() + .unwrap() + .unwrap_or_else(|e| log::error!("error running subtitle extraction: {}", e)); + } + + Ok(()) + } +} diff --git a/src/subtitles/extraction/whisper.rs b/src/subtitles/extraction/whisper.rs new file mode 100644 index 0000000..bd6fba7 --- /dev/null +++ b/src/subtitles/extraction/whisper.rs @@ -0,0 +1,139 @@ +use std::{ + io::{self, BufRead, BufReader}, + net::{TcpListener, TcpStream}, + sync::mpsc, +}; + +use anyhow::Context; +use ffmpeg::{filter, frame}; +use serde::Deserialize; + +use crate::{ + subtitles::extraction::*, + subtitles::{StreamIndex, SubtitleCue}, +}; + +#[derive(Debug, Deserialize)] +struct WhisperCue { + start: u64, + end: u64, + text: String, +} + +pub fn generate_whisper_subtitles( + // stream index to use when storing generated subtitles, this index + // already has to be in TRACKS when this function is called! + stream_ix: StreamIndex, + context: ffmpeg::codec::Context, + time_base: ffmpeg::Rational, + packet_rx: mpsc::Receiver<ffmpeg::Packet>, + sender: ComponentSender<SubtitleExtractor>, +) -> anyhow::Result<()> { + // FFmpeg's whisper filter will send the generated subtitles to us as JSON + // objects over a TCP socket. This is the best solution I could find + // because we need to use one of the protocols in + // https://ffmpeg.org/ffmpeg-protocols.html, and TCP is the only one on the + // list which is portable and supports non-blocking IO in Rust. + let tcp_listener = TcpListener::bind("127.0.0.1:0")?; + + let mut decoder = context + .decoder() + .audio() + .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?; + + let mut filter = filter::Graph::new(); + + let abuffer_args = format!( + "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}", + time_base, + decoder.rate(), + decoder.format().name(), + decoder.channel_layout().bits() + ); + + let whisper_args = format!( + "model={}:queue={}:destination=tcp\\\\://127.0.0.1\\\\:{}:format=json", + "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin", + 30, + tcp_listener.local_addr()?.port() + ); + let filter_spec = format!("[src] whisper={} [sink]", whisper_args); + + filter.add(&filter::find("abuffer").unwrap(), "src", &abuffer_args)?; + filter.add(&filter::find("abuffersink").unwrap(), "sink", "")?; + filter + .output("src", 0)? + .input("sink", 0)? + .parse(&filter_spec)?; + filter.validate()?; + + let mut source_ctx = filter.get("src").unwrap(); + let mut sink_ctx = filter.get("sink").unwrap(); + + let (tcp_stream, _) = tcp_listener.accept()?; + tcp_stream.set_nonblocking(true)?; + + let mut transcript_reader = BufReader::new(tcp_stream); + let mut line_buf = String::new(); + + while let Ok(packet) = packet_rx.recv() { + handle_packet( + stream_ix, + &sender, + &mut decoder, + source_ctx.source(), + sink_ctx.sink(), + &mut transcript_reader, + &mut line_buf, + packet, + ) + .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e)) + } + + Ok(()) +} + +// TODO: can we do this without passing all the arguments? this is kinda ugly +fn handle_packet( + stream_ix: StreamIndex, + sender: &ComponentSender<SubtitleExtractor>, + decoder: &mut ffmpeg::decoder::Audio, + mut source: filter::Source, + mut sink: filter::Sink, + transcript_reader: &mut BufReader<TcpStream>, + line_buf: &mut String, + packet: ffmpeg::Packet, +) -> anyhow::Result<()> { + decoder.send_packet(&packet)?; + + let mut decoded = frame::Audio::empty(); + while decoder.receive_frame(&mut decoded).is_ok() { + source.add(&decoded)?; + } + + let mut out_frame = frame::Audio::empty(); + while sink.frame(&mut out_frame).is_ok() {} + + line_buf.clear(); + match transcript_reader.read_line(line_buf) { + Ok(_) => { + let whisper_cue: WhisperCue = serde_json::from_str(&line_buf)?; + + let cue = SubtitleCue { + text: whisper_cue.text, + start_time: gst::ClockTime::from_mseconds(whisper_cue.start), + end_time: gst::ClockTime::from_mseconds(whisper_cue.end), + }; + + sender + .output(SubtitleExtractorOutput::NewCue(stream_ix, cue)) + .unwrap(); + + Ok(()) + } + Err(e) => match e.kind() { + io::ErrorKind::WouldBlock => Ok(()), + _ => Err(e)?, + }, + } +} |