Diffstat (limited to 'src/subtitle_extraction')
-rw-r--r--   src/subtitle_extraction/embedded.rs | 118
-rw-r--r--   src/subtitle_extraction/mod.rs      | 159
-rw-r--r--   src/subtitle_extraction/whisper.rs  | 143
3 files changed, 0 insertions, 420 deletions
diff --git a/src/subtitle_extraction/embedded.rs b/src/subtitle_extraction/embedded.rs
deleted file mode 100644
index 0ba6178..0000000
--- a/src/subtitle_extraction/embedded.rs
+++ /dev/null
@@ -1,118 +0,0 @@
-use std::sync::mpsc;
-
-use anyhow::Context;
-
-use crate::subtitle_extraction::*;
-
-pub fn extract_embedded_subtitles(
-    // stream index to use when storing extracted subtitles, this index already
-    // has to be in TRACKS when this function is called!
-    stream_ix: StreamIndex,
-    context: ffmpeg::codec::Context,
-    time_base: ffmpeg::Rational,
-    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
-    sender: ComponentSender<SubtitleExtractor>,
-) -> anyhow::Result<()> {
-    let mut decoder = context
-        .decoder()
-        .subtitle()
-        .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
-
-    while let Ok(packet) = packet_rx.recv() {
-        let mut subtitle = ffmpeg::Subtitle::new();
-        match decoder.decode(&packet, &mut subtitle) {
-            Ok(true) => {
-                if let Some(cue) = parse_subtitle(&subtitle, &packet, time_base) {
-                    SUBTITLE_TRACKS
-                        .write()
-                        .get_mut(&stream_ix)
-                        .unwrap()
-                        .cues
-                        .push(cue.clone());
-                    sender
-                        .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
-                        .unwrap();
-                } else {
-                    log::error!("error parsing subtitle at pts {:?}", packet.pts())
-                }
-            }
-            Ok(false) => {
-                log::debug!("got empty (?) subtitle, not sure if this should ever happen");
-            }
-            Err(e) => {
-                log::error!("error decoding subtitle: {:?}", e)
-            }
-        }
-    }
-
-    Ok(())
-}
-
-fn parse_subtitle(
-    subtitle: &ffmpeg::Subtitle,
-    packet: &ffmpeg::Packet,
-    time_base: Rational,
-) -> Option<SubtitleCue> {
-    let pts_to_clock_time = |pts: i64| {
-        let nseconds: i64 =
-            (pts * time_base.numerator() as i64 * 1_000_000_000) / time_base.denominator() as i64;
-        gst::ClockTime::from_nseconds(nseconds as u64)
-    };
-
-    let text = subtitle
-        .rects()
-        .into_iter()
-        .map(|rect| match rect {
-            ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
-            ffmpeg::subtitle::Rect::Ass(ass) => {
-                extract_dialogue_text(ass.get()).unwrap_or(String::new())
-            }
-            _ => String::new(),
-        })
-        .collect::<Vec<String>>()
-        .join("\n— ");
-
-    let start = pts_to_clock_time(packet.pts()?);
-    let end = pts_to_clock_time(packet.pts()? + packet.duration());
-
-    Some(SubtitleCue { start, end, text })
-}
-
-fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
-    // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
-    // we need the 9th field (Text), so split on comma but only take first 9 splits
-    // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
-    let text = dialogue_line.splitn(9, ',').last()?;
-
-    // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc.
-    let mut result = String::new();
-    let mut in_tag = false;
-    let mut char_iter = text.chars().peekable();
-
-    while let Some(c) = char_iter.next() {
-        if c == '{' && char_iter.peek() == Some(&'\\') {
-            in_tag = true;
-        } else if c == '}' {
-            in_tag = false;
-        } else if !in_tag {
-            // process line breaks and hard spaces
-            if c == '\\' {
-                match char_iter.peek() {
-                    Some(&'N') => {
-                        char_iter.next();
-                        result.push('\n');
-                    }
-                    Some(&'n') | Some(&'h') => {
-                        char_iter.next();
-                        result.push(' ');
-                    }
-                    _ => result.push(c),
-                }
-            } else {
-                result.push(c);
-            }
-        }
-    }
-
-    Some(result)
-}
diff --git a/src/subtitle_extraction/mod.rs b/src/subtitle_extraction/mod.rs
deleted file mode 100644
index 9e7fff4..0000000
--- a/src/subtitle_extraction/mod.rs
+++ /dev/null
@@ -1,159 +0,0 @@
-/// Extraction of embedded subtitles
-mod embedded;
-/// Synthesis of subtitles from audio using whisper.cpp
-mod whisper;
-
-use std::{collections::BTreeMap, sync::mpsc, thread};
-
-use ffmpeg::Rational;
-use relm4::{ComponentSender, Worker};
-
-use crate::tracks::{SUBTITLE_TRACKS, StreamIndex, SubtitleCue, SubtitleTrack, TrackMetadata};
-
-pub struct SubtitleExtractor {}
-
-#[derive(Debug)]
-pub enum SubtitleExtractorMsg {
-    ExtractFromUrl {
-        url: String,
-        // the index of the audio stream on which to run a whisper transcription
-        whisper_stream_index: Option<usize>,
-    },
-}
-
-#[derive(Debug)]
-pub enum SubtitleExtractorOutput {
-    NewCue(StreamIndex, SubtitleCue),
-    ExtractionComplete,
-}
-
-impl Worker for SubtitleExtractor {
-    type Init = ();
-    type Input = SubtitleExtractorMsg;
-    type Output = SubtitleExtractorOutput;
-
-    fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self {
-        Self {}
-    }
-
-    fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) {
-        match msg {
-            SubtitleExtractorMsg::ExtractFromUrl {
-                url,
-                whisper_stream_index: whisper_audio_stream_ix,
-            } => {
-                self.handle_extract_from_url(url, whisper_audio_stream_ix, sender);
-            }
-        }
-    }
-}
-
-impl SubtitleExtractor {
-    fn handle_extract_from_url(
-        &mut self,
-        url: String,
-        whisper_audio_stream_ix: Option<usize>,
-        sender: ComponentSender<Self>,
-    ) {
-        // Clear existing tracks
-        SUBTITLE_TRACKS.write().clear();
-
-        match self.extract_subtitles(&url, whisper_audio_stream_ix, sender.clone()) {
-            Ok(_) => {
-                log::info!("Subtitle extraction completed successfully");
-                sender
-                    .output(SubtitleExtractorOutput::ExtractionComplete)
-                    .unwrap();
-            }
-            Err(e) => {
-                log::error!("Subtitle extraction failed: {}", e);
-            }
-        }
-    }
-
-    fn extract_subtitles(
-        &self,
-        url: &str,
-        whisper_audio_stream_ix: Option<usize>,
-        sender: ComponentSender<Self>,
-    ) -> anyhow::Result<()> {
-        let mut input = ffmpeg::format::input(&url)?;
-
-        let mut subtitle_extractors = BTreeMap::new();
-
-        // create extractor for each subtitle stream
-        for stream in input.streams() {
-            let stream_ix = stream.index();
-
-            if stream.parameters().medium() == ffmpeg::media::Type::Subtitle {
-                let metadata = TrackMetadata::from_ffmpeg_stream(&stream);
-                let track = SubtitleTrack {
-                    metadata,
-                    cues: Vec::new(),
-                };
-
-                SUBTITLE_TRACKS.write().insert(stream_ix, track);
-
-                let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?;
-                let (packet_tx, packet_rx) = mpsc::channel();
-                let time_base = stream.time_base();
-                let sender = sender.clone();
-                let join_handle = thread::spawn(move || {
-                    embedded::extract_embedded_subtitles(
-                        stream_ix, context, time_base, packet_rx, sender,
-                    )
-                });
-
-                subtitle_extractors.insert(stream_ix, (packet_tx, join_handle));
-            }
-        }
-
-        if let Some(stream_ix) = whisper_audio_stream_ix {
-            let stream = input.stream(stream_ix).unwrap();
-
-            let mut metadata = TrackMetadata::from_ffmpeg_stream(&stream);
-            metadata.title = Some(match metadata.title {
-                Some(title) => format!("Auto-generated from audio (Whisper): {}", title),
-                None => "Auto-generated from audio (Whisper)".to_string(),
-            });
-
-            let track = SubtitleTrack {
-                metadata,
-                cues: Vec::new(),
-            };
-
-            SUBTITLE_TRACKS.write().insert(stream_ix, track);
-
-            let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?;
-            let (packet_tx, packet_rx) = mpsc::channel();
-            let time_base = stream.time_base();
-            let sender = sender.clone();
-            let join_handle = thread::spawn(move || {
-                whisper::generate_whisper_subtitles(
-                    stream_ix, context, time_base, packet_rx, sender,
-                )
-            });
-
-            subtitle_extractors.insert(stream_ix, (packet_tx, join_handle));
-        }
-
-        // process packets
-        for (stream, packet) in input.packets() {
-            let stream_index = stream.index();
-
-            if let Some((packet_tx, _)) = subtitle_extractors.get_mut(&stream_index) {
-                packet_tx.send(packet).unwrap();
-            }
-        }
-
-        // wait for extraction to complete
-        for (_, (_, join_handle)) in subtitle_extractors {
-            join_handle
-                .join()
-                .unwrap()
-                .unwrap_or_else(|e| log::error!("error running subtitle extraction: {}", e));
-        }
-
-        Ok(())
-    }
-}
diff --git a/src/subtitle_extraction/whisper.rs b/src/subtitle_extraction/whisper.rs
deleted file mode 100644
index ffa2e47..0000000
--- a/src/subtitle_extraction/whisper.rs
+++ /dev/null
@@ -1,143 +0,0 @@
-use std::{
-    io::{self, BufRead, BufReader},
-    net::{TcpListener, TcpStream},
-    sync::mpsc,
-};
-
-use anyhow::Context;
-use ffmpeg::{filter, frame};
-use serde::Deserialize;
-
-use crate::{subtitle_extraction::*, tracks::StreamIndex};
-
-#[derive(Debug, Deserialize)]
-struct WhisperCue {
-    start: u64,
-    end: u64,
-    text: String,
-}
-
-pub fn generate_whisper_subtitles(
-    // stream index to use when storing generated subtitles, this index
-    // already has to be in TRACKS when this function is called!
-    stream_ix: StreamIndex,
-    context: ffmpeg::codec::Context,
-    time_base: ffmpeg::Rational,
-    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
-    sender: ComponentSender<SubtitleExtractor>,
-) -> anyhow::Result<()> {
-    // FFmpeg's whisper filter will send the generated subtitles to us as JSON
-    // objects over a TCP socket. This is the best solution I could find
-    // because we need to use one of the protocols in
-    // https://ffmpeg.org/ffmpeg-protocols.html, and TCP is the only one on the
-    // list which is portable and supports non-blocking IO in Rust.
-    let tcp_listener = TcpListener::bind("127.0.0.1:0")?;
-
-    let mut decoder = context
-        .decoder()
-        .audio()
-        .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
-
-    let mut filter = filter::Graph::new();
-
-    let abuffer_args = format!(
-        "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
-        time_base,
-        decoder.rate(),
-        decoder.format().name(),
-        decoder.channel_layout().bits()
-    );
-
-    let whisper_args = format!(
-        "model={}:queue={}:destination=tcp\\\\://127.0.0.1\\\\:{}:format=json",
-        "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin",
-        30,
-        tcp_listener.local_addr()?.port()
-    );
-    let filter_spec = format!("[src] whisper={} [sink]", whisper_args);
-
-    filter.add(&filter::find("abuffer").unwrap(), "src", &abuffer_args)?;
-    filter.add(&filter::find("abuffersink").unwrap(), "sink", "")?;
-    filter
-        .output("src", 0)?
-        .input("sink", 0)?
-        .parse(&filter_spec)?;
-    filter.validate()?;
-
-    let mut source_ctx = filter.get("src").unwrap();
-    let mut sink_ctx = filter.get("sink").unwrap();
-
-    let (tcp_stream, _) = tcp_listener.accept()?;
-    tcp_stream.set_nonblocking(true)?;
-
-    let mut transcript_reader = BufReader::new(tcp_stream);
-    let mut line_buf = String::new();
-
-    while let Ok(packet) = packet_rx.recv() {
-        handle_packet(
-            stream_ix,
-            &sender,
-            &mut decoder,
-            source_ctx.source(),
-            sink_ctx.sink(),
-            &mut transcript_reader,
-            &mut line_buf,
-            packet,
-        )
-        .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e))
-    }
-
-    Ok(())
-}
-
-// TODO: can we do this without passing all the arguments? this is kinda ugly
-fn handle_packet(
-    stream_ix: StreamIndex,
-    sender: &ComponentSender<SubtitleExtractor>,
-    decoder: &mut ffmpeg::decoder::Audio,
-    mut source: filter::Source,
-    mut sink: filter::Sink,
-    transcript_reader: &mut BufReader<TcpStream>,
-    line_buf: &mut String,
-    packet: ffmpeg::Packet,
-) -> anyhow::Result<()> {
-    decoder.send_packet(&packet)?;
-
-    let mut decoded = frame::Audio::empty();
-    while decoder.receive_frame(&mut decoded).is_ok() {
-        source.add(&decoded)?;
-    }
-
-    let mut out_frame = frame::Audio::empty();
-    while sink.frame(&mut out_frame).is_ok() {}
-
-    line_buf.clear();
-    match transcript_reader.read_line(line_buf) {
-        Ok(_) => {
-            let whisper_cue: WhisperCue = serde_json::from_str(&line_buf)?;
-
-            let cue = SubtitleCue {
-                start: gst::ClockTime::from_mseconds(whisper_cue.start),
-                end: gst::ClockTime::from_mseconds(whisper_cue.end),
-                text: whisper_cue.text,
-            };
-
-            // TODO deduplicate this vs. the code in embedded.rs
-            SUBTITLE_TRACKS
-                .write()
-                .get_mut(&stream_ix)
-                .unwrap()
-                .cues
-                .push(cue.clone());
-            sender
-                .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
-                .unwrap();
-
-            Ok(())
-        }
-        Err(e) => match e.kind() {
-            io::ErrorKind::WouldBlock => Ok(()),
-            _ => Err(e)?,
-        },
-    }
-}
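
For reference, the ASS handling that the deleted embedded.rs performed (take the Text field of a Dialogue line, strip {\...} override blocks, map \N to a newline and \n or \h to a space) can be exercised on its own. The following is a minimal standalone sketch of that logic, not the deleted module itself; the sample Dialogue payload in main is hypothetical and only for illustration.

// Standalone sketch of the ASS text handling from the deleted embedded.rs:
// take the 9th comma-separated field (Text) of a Dialogue line, strip {\...}
// override blocks, and map \N to a newline and \n or \h to a space.
fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
    let text = dialogue_line.splitn(9, ',').last()?;

    let mut result = String::new();
    let mut in_tag = false;
    let mut chars = text.chars().peekable();

    while let Some(c) = chars.next() {
        if c == '{' && chars.peek() == Some(&'\\') {
            in_tag = true;
        } else if c == '}' {
            in_tag = false;
        } else if !in_tag {
            if c == '\\' {
                match chars.peek() {
                    Some(&'N') => {
                        chars.next();
                        result.push('\n');
                    }
                    Some(&'n') | Some(&'h') => {
                        chars.next();
                        result.push(' ');
                    }
                    _ => result.push(c),
                }
            } else {
                result.push(c);
            }
        }
    }

    Some(result)
}

fn main() {
    // hypothetical Dialogue payload, only for illustration
    let line = r"1,0,Default,,0,0,0,,{\i1}Hello{\i0} there\Nworld";
    assert_eq!(
        extract_dialogue_text(line).as_deref(),
        Some("Hello there\nworld")
    );
}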
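Likewise, the deleted whisper.rs read the whisper filter's output line by line from a TCP socket and deserialized each line into a cue with millisecond start and end fields. Below is a minimal sketch of just that deserialization step, assuming serde (with the derive feature) and serde_json as dependencies; the sample line is illustrative, not captured filter output.

// Sketch of the per-line JSON deserialization the deleted whisper.rs performed
// on the filter's TCP output. Field names and millisecond timestamps follow the
// WhisperCue struct in the deleted code; the input line here is made up.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct WhisperCue {
    start: u64, // milliseconds
    end: u64,   // milliseconds
    text: String,
}

fn main() -> Result<(), serde_json::Error> {
    let line = r#"{"start": 1200, "end": 3400, "text": "hello world"}"#;
    let cue: WhisperCue = serde_json::from_str(line)?;
    println!("[{} ms to {} ms] {}", cue.start, cue.end, cue.text);
    Ok(())
}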