summary refs log tree commit diff
path: root/src/subtitle_extraction
diff options
context:
space:
mode:
author: Malte Voos <git@mal.tc> 2025-12-05 15:35:38 +0100
committer: Malte Voos <git@mal.tc> 2025-12-05 15:43:58 +0100
commit: c347b6133365dcf1b7da4e77890b20d04d6cfba4 (patch)
tree: c83aac6f7d1e6edc57e607f01e5d3eeee8da4a0e /src/subtitle_extraction
parent: 652b1c2a0ce7db4885ebc51f7f09133a43401442 (diff)
download: lleap-c347b6133365dcf1b7da4e77890b20d04d6cfba4.tar.gz
lleap-c347b6133365dcf1b7da4e77890b20d04d6cfba4.zip
implement machine translation; various fixes and refactorings HEAD main
Diffstat (limited to 'src/subtitle_extraction')
-rw-r--r-- src/subtitle_extraction/embedded.rs 118
-rw-r--r-- src/subtitle_extraction/mod.rs 159
-rw-r--r-- src/subtitle_extraction/whisper.rs 143
3 files changed, 0 insertions, 420 deletions
diff --git a/src/subtitle_extraction/embedded.rs b/src/subtitle_extraction/embedded.rs
deleted file mode 100644
index 0ba6178..0000000
--- a/src/subtitle_extraction/embedded.rs
+++ /dev/null
@@ -1,118 +0,0 @@
-use std::sync::mpsc;
-
-use anyhow::Context;
-
-use crate::subtitle_extraction::*;
-
-/// Decodes every packet received on `packet_rx` as a subtitle and publishes
-/// the resulting cues.
-///
-/// Each successfully parsed cue is appended to the `stream_ix` entry of
-/// `SUBTITLE_TRACKS` and also forwarded through `sender` as a
-/// `SubtitleExtractorOutput::NewCue`. Decode and parse failures are logged and
-/// the packet is skipped. The function returns once the sending half of
-/// `packet_rx` has hung up.
-///
-/// # Errors
-///
-/// Fails if no subtitle decoder can be created from `context`.
-///
-/// # Panics
-///
-/// Panics if `stream_ix` is missing from `SUBTITLE_TRACKS` or if forwarding a
-/// cue through `sender` fails.
-pub fn extract_embedded_subtitles(
-    // stream index to use when storing extracted subtitles, this index already
-    // has to be in TRACKS when this function is called!
-    stream_ix: StreamIndex,
-    context: ffmpeg::codec::Context,
-    time_base: ffmpeg::Rational,
-    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
-    sender: ComponentSender<SubtitleExtractor>,
-) -> anyhow::Result<()> {
-    let mut decoder = context
-        .decoder()
-        .subtitle()
-        .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
-
-    // iterating the receiver blocks for the next packet and stops on hang-up
-    for packet in packet_rx.iter() {
-        let mut decoded = ffmpeg::Subtitle::new();
-
-        let got_subtitle = match decoder.decode(&packet, &mut decoded) {
-            Ok(got_subtitle) => got_subtitle,
-            Err(e) => {
-                log::error!("error decoding subtitle: {:?}", e);
-                continue;
-            }
-        };
-
-        if !got_subtitle {
-            log::debug!("got empty (?) subtitle, not sure if this should ever happen");
-            continue;
-        }
-
-        let cue = match parse_subtitle(&decoded, &packet, time_base) {
-            Some(cue) => cue,
-            None => {
-                log::error!("error parsing subtitle at pts {:?}", packet.pts());
-                continue;
-            }
-        };
-
-        // single statement so the write guard is released before notifying
-        SUBTITLE_TRACKS
-            .write()
-            .get_mut(&stream_ix)
-            .unwrap()
-            .cues
-            .push(cue.clone());
-
-        let new_cue = SubtitleExtractorOutput::NewCue(stream_ix, cue);
-        sender.output(new_cue).unwrap();
-    }
-
-    Ok(())
-}
-
-/// Converts a decoded FFmpeg subtitle into a [`SubtitleCue`].
-///
-/// The cue starts at the packet's pts and ends at pts plus the packet's
-/// duration, both converted from `time_base` units to nanoseconds. The text
-/// is built from all rects of `subtitle`, joined with `"\n— "`: text rects are
-/// taken verbatim, ASS rects are reduced to their dialogue text, and any
-/// other rect kind contributes an empty string.
-///
-/// Returns `None` when the packet has no pts or when a timestamp cannot be
-/// represented (zero time base denominator, negative result, or overflow).
-fn parse_subtitle(
-    subtitle: &ffmpeg::Subtitle,
-    packet: &ffmpeg::Packet,
-    time_base: Rational,
-) -> Option<SubtitleCue> {
-    let pts_to_clock_time = |pts: i64| -> Option<gst::ClockTime> {
-        let denominator = i128::from(time_base.denominator());
-        if denominator == 0 {
-            return None;
-        }
-
-        // Widen to i128: `pts * 1_000_000_000` exceeds i64 after roughly 2.5
-        // hours when the time base is in microseconds.
-        let nseconds =
-            i128::from(pts) * i128::from(time_base.numerator()) * 1_000_000_000 / denominator;
-
-        // Rejects negative timestamps instead of wrapping them into huge ones.
-        u64::try_from(nseconds)
-            .ok()
-            .map(gst::ClockTime::from_nseconds)
-    };
-
-    // Resolve the timestamps first so no text is built for unusable packets.
-    let pts = packet.pts()?;
-    let start = pts_to_clock_time(pts)?;
-    let end = pts_to_clock_time(pts.checked_add(packet.duration())?)?;
-
-    let text = subtitle
-        .rects()
-        .into_iter()
-        .map(|rect| match rect {
-            ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
-            ffmpeg::subtitle::Rect::Ass(ass) => {
-                extract_dialogue_text(ass.get()).unwrap_or_default()
-            }
-            _ => String::new(),
-        })
-        .collect::<Vec<String>>()
-        .join("\n— ");
-
-    Some(SubtitleCue { start, end, text })
-}
-
-/// Extracts the plain, displayable text from an ASS dialogue line as emitted
-/// by FFmpeg's subtitle decoders.
-///
-/// * `{...}` override blocks (formatting tags such as `{\b1}` as well as
-///   inline comments) are removed. A `{` that is never closed and a `}`
-///   outside of a block are kept as literal text.
-/// * `\N` becomes a line break; `\n` and `\h` become a space.
-///
-/// Returns `None` if the line has fewer than nine comma-separated fields,
-/// i.e. it has no Text field.
-fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
-    // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
-    // we need the 9th field (Text); limiting the split to 9 pieces keeps commas
-    // inside the text intact, and `nth(8)` rejects lines that have no 9th field
-    // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
-    let text = dialogue_line.splitn(9, ',').nth(8)?;
-
-    let mut result = String::with_capacity(text.len());
-    let mut rest = text;
-
-    while let Some(c) = rest.chars().next() {
-        let tail = &rest[c.len_utf8()..];
-
-        match c {
-            // an override block only exists if it is closed later in the line
-            '{' => match tail.find('}') {
-                Some(close) => rest = &tail[close + 1..],
-                None => {
-                    result.push(c);
-                    rest = tail;
-                }
-            },
-            // process line breaks and hard spaces
-            '\\' => match tail.chars().next() {
-                Some('N') => {
-                    result.push('\n');
-                    rest = &tail[1..];
-                }
-                Some('n') | Some('h') => {
-                    result.push(' ');
-                    rest = &tail[1..];
-                }
-                _ => {
-                    result.push(c);
-                    rest = tail;
-                }
-            },
-            _ => {
-                result.push(c);
-                rest = tail;
-            }
-        }
-    }
-
-    Some(result)
-}
diff --git a/src/subtitle_extraction/mod.rs b/src/subtitle_extraction/mod.rs
deleted file mode 100644
index 9e7fff4..0000000
--- a/src/subtitle_extraction/mod.rs
+++ /dev/null
@@ -1,159 +0,0 @@
-/// Extraction of embedded subtitles
-mod embedded;
-/// Synthesis of subtitles from audio using whisper.cpp
-mod whisper;
-
-use std::{collections::BTreeMap, sync::mpsc, thread};
-
-use ffmpeg::Rational;
-use relm4::{ComponentSender, Worker};
-
-use crate::tracks::{SUBTITLE_TRACKS, StreamIndex, SubtitleCue, SubtitleTrack, TrackMetadata};
-
-/// Stateless relm4 worker that extracts subtitle cues from a media URL.
-///
-/// Cues are stored in `SUBTITLE_TRACKS` and reported through
-/// [`SubtitleExtractorOutput`].
-pub struct SubtitleExtractor {}
-
-/// Requests handled by [`SubtitleExtractor`].
-#[derive(Debug)]
-pub enum SubtitleExtractorMsg {
-    /// Extract all subtitle streams of the media at `url` and, optionally,
-    /// transcribe one of its audio streams.
-    ExtractFromUrl {
-        /// location of the media, opened with `ffmpeg::format::input`
-        url: String,
-        /// the index of the audio stream on which to run a whisper transcription
-        whisper_stream_index: Option<usize>,
-    },
-}
-
-/// Notifications emitted by [`SubtitleExtractor`].
-#[derive(Debug)]
-pub enum SubtitleExtractorOutput {
-    /// A cue was appended to the track with the given stream index.
-    NewCue(StreamIndex, SubtitleCue),
-    /// Extraction of the requested URL finished without returning an error.
-    ExtractionComplete,
-}
-
-impl Worker for SubtitleExtractor {
-    type Init = ();
-    type Input = SubtitleExtractorMsg;
-    type Output = SubtitleExtractorOutput;
-
-    /// Creates the worker; it holds no state, so both arguments are unused.
-    fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self {
-        SubtitleExtractor {}
-    }
-
-    /// Runs the extraction requested by `msg`.
-    fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) {
-        // `ExtractFromUrl` is the only variant, so it can be destructured directly
-        let SubtitleExtractorMsg::ExtractFromUrl {
-            url,
-            whisper_stream_index,
-        } = msg;
-
-        self.handle_extract_from_url(url, whisper_stream_index, sender);
-    }
-}
-
-impl SubtitleExtractor {
-    /// Replaces the contents of `SUBTITLE_TRACKS` with the subtitles found at
-    /// `url`.
-    ///
-    /// Emits `SubtitleExtractorOutput::ExtractionComplete` once extraction has
-    /// finished successfully; a failure is only logged.
-    fn handle_extract_from_url(
-        &mut self,
-        url: String,
-        whisper_audio_stream_ix: Option<usize>,
-        sender: ComponentSender<Self>,
-    ) {
-        // Clear existing tracks
-        SUBTITLE_TRACKS.write().clear();
-
-        match self.extract_subtitles(&url, whisper_audio_stream_ix, sender.clone()) {
-            Ok(_) => {
-                log::info!("Subtitle extraction completed successfully");
-                sender
-                    .output(SubtitleExtractorOutput::ExtractionComplete)
-                    .unwrap();
-            }
-            Err(e) => {
-                log::error!("Subtitle extraction failed: {}", e);
-            }
-        }
-    }
-
-    /// Demuxes `url` and feeds its packets to one worker thread per track.
-    ///
-    /// A worker is spawned for every subtitle stream and, if
-    /// `whisper_audio_stream_ix` is set, one more for the Whisper
-    /// transcription of that audio stream. Each track is registered in
-    /// `SUBTITLE_TRACKS` before its worker starts. The call blocks until all
-    /// packets have been dispatched and all workers have finished.
-    ///
-    /// # Errors
-    ///
-    /// Fails if the input cannot be opened, a codec context cannot be created,
-    /// or `whisper_audio_stream_ix` does not name a stream of the input.
-    /// Errors and panics inside a worker are logged, not returned.
-    fn extract_subtitles(
-        &self,
-        url: &str,
-        whisper_audio_stream_ix: Option<usize>,
-        sender: ComponentSender<Self>,
-    ) -> anyhow::Result<()> {
-        let mut input = ffmpeg::format::input(&url)?;
-
-        // Senders and join handles are kept apart so that every sender can be
-        // dropped before the workers are joined.
-        let mut packet_senders = BTreeMap::new();
-        let mut join_handles = Vec::new();
-
-        // create extractor for each subtitle stream
-        for stream in input.streams() {
-            let stream_ix = stream.index();
-
-            if stream.parameters().medium() == ffmpeg::media::Type::Subtitle {
-                let metadata = TrackMetadata::from_ffmpeg_stream(&stream);
-                let track = SubtitleTrack {
-                    metadata,
-                    cues: Vec::new(),
-                };
-
-                SUBTITLE_TRACKS.write().insert(stream_ix, track);
-
-                let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?;
-                let (packet_tx, packet_rx) = mpsc::channel();
-                let time_base = stream.time_base();
-                let sender = sender.clone();
-                let join_handle = thread::spawn(move || {
-                    embedded::extract_embedded_subtitles(
-                        stream_ix, context, time_base, packet_rx, sender,
-                    )
-                });
-
-                packet_senders.insert(stream_ix, packet_tx);
-                join_handles.push((stream_ix, join_handle));
-            }
-        }
-
-        if let Some(stream_ix) = whisper_audio_stream_ix {
-            // the index comes from the caller, so a bad value must not panic
-            let stream = input.stream(stream_ix).ok_or_else(|| {
-                anyhow::anyhow!(
-                    "stream {} selected for whisper transcription does not exist",
-                    stream_ix
-                )
-            })?;
-
-            let mut metadata = TrackMetadata::from_ffmpeg_stream(&stream);
-            metadata.title = Some(match metadata.title {
-                Some(title) => format!("Auto-generated from audio (Whisper): {}", title),
-                None => "Auto-generated from audio (Whisper)".to_string(),
-            });
-
-            let track = SubtitleTrack {
-                metadata,
-                cues: Vec::new(),
-            };
-
-            SUBTITLE_TRACKS.write().insert(stream_ix, track);
-
-            let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?;
-            let (packet_tx, packet_rx) = mpsc::channel();
-            let time_base = stream.time_base();
-            let sender = sender.clone();
-            let join_handle = thread::spawn(move || {
-                whisper::generate_whisper_subtitles(
-                    stream_ix, context, time_base, packet_rx, sender,
-                )
-            });
-
-            packet_senders.insert(stream_ix, packet_tx);
-            join_handles.push((stream_ix, join_handle));
-        }
-
-        // process packets
-        for (stream, packet) in input.packets() {
-            let stream_index = stream.index();
-
-            // A failed send means the worker has already returned (e.g. its
-            // decoder could not be created); its error is reported on join.
-            let worker_gone = packet_senders
-                .get(&stream_index)
-                .map_or(false, |packet_tx| packet_tx.send(packet).is_err());
-
-            if worker_gone {
-                log::warn!(
-                    "extractor for stream {} stopped early, skipping its remaining packets",
-                    stream_index
-                );
-                packet_senders.remove(&stream_index);
-            }
-        }
-
-        // Hang up all channels: the workers loop until `recv` reports that the
-        // sender is gone, so joining them with live senders would never return.
-        drop(packet_senders);
-
-        // wait for extraction to complete
-        for (stream_ix, join_handle) in join_handles {
-            match join_handle.join() {
-                Ok(Ok(())) => {}
-                Ok(Err(e)) => log::error!("error running subtitle extraction: {}", e),
-                Err(_) => log::error!(
-                    "subtitle extraction thread for stream {} panicked",
-                    stream_ix
-                ),
-            }
-        }
-
-        Ok(())
-    }
-}
diff --git a/src/subtitle_extraction/whisper.rs b/src/subtitle_extraction/whisper.rs
deleted file mode 100644
index ffa2e47..0000000
--- a/src/subtitle_extraction/whisper.rs
+++ /dev/null
@@ -1,143 +0,0 @@
-use std::{
-    io::{self, BufRead, BufReader},
-    net::{TcpListener, TcpStream},
-    sync::mpsc,
-};
-
-use anyhow::Context;
-use ffmpeg::{filter, frame};
-use serde::Deserialize;
-
-use crate::{subtitle_extraction::*, tracks::StreamIndex};
-
-/// One transcript entry, deserialized from a single JSON line read from the
-/// transcript socket.
-#[derive(Debug, Deserialize)]
-struct WhisperCue {
-    /// start of the cue, interpreted as milliseconds when building the `SubtitleCue`
-    start: u64,
-    /// end of the cue, interpreted as milliseconds when building the `SubtitleCue`
-    end: u64,
-    /// transcribed text of the cue
-    text: String,
-}
-
-/// Transcribes the audio packets received on `packet_rx` with FFmpeg's
-/// `whisper` filter and publishes the resulting cues.
-///
-/// Decoded audio is pushed through an `abuffer -> whisper -> abuffersink`
-/// filter graph. The transcript comes back as JSON lines on a local TCP
-/// connection and is handled by `handle_packet`, once per received packet.
-/// Per-packet failures are logged and do not stop the transcription. The
-/// function returns once the sending half of `packet_rx` has hung up.
-///
-/// # Errors
-///
-/// Fails if the TCP listener, the audio decoder or the filter graph cannot be
-/// set up, or if the filter's connection cannot be accepted.
-pub fn generate_whisper_subtitles(
-    // stream index to use when storing generated subtitles, this index
-    // already has to be in TRACKS when this function is called!
-    stream_ix: StreamIndex,
-    context: ffmpeg::codec::Context,
-    time_base: ffmpeg::Rational,
-    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
-    sender: ComponentSender<SubtitleExtractor>,
-) -> anyhow::Result<()> {
-    // NOTE(review): developer-local absolute path; this should come from
-    // configuration before the code runs on any other machine.
-    const WHISPER_MODEL_PATH: &str = "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin";
-    // value passed as the whisper filter's `queue` option
-    const WHISPER_QUEUE: u32 = 30;
-
-    // FFmpeg's whisper filter will send the generated subtitles to us as JSON
-    // objects over a TCP socket. This is the best solution I could find
-    // because we need to use one of the protocols in
-    // https://ffmpeg.org/ffmpeg-protocols.html, and TCP is the only one on the
-    // list which is portable and supports non-blocking IO in Rust.
-    let tcp_listener = TcpListener::bind("127.0.0.1:0")?;
-
-    let mut decoder = context
-        .decoder()
-        .audio()
-        .with_context(|| format!("error creating audio decoder for stream {}", stream_ix))?;
-
-    let mut filter = filter::Graph::new();
-
-    let abuffer_args = format!(
-        "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
-        time_base,
-        decoder.rate(),
-        decoder.format().name(),
-        decoder.channel_layout().bits()
-    );
-
-    // the colons of the destination URL are escaped so that the filter
-    // argument parser does not treat them as option separators
-    let whisper_args = format!(
-        "model={}:queue={}:destination=tcp\\\\://127.0.0.1\\\\:{}:format=json",
-        WHISPER_MODEL_PATH,
-        WHISPER_QUEUE,
-        tcp_listener.local_addr()?.port()
-    );
-    let filter_spec = format!("[src] whisper={} [sink]", whisper_args);
-
-    filter.add(&filter::find("abuffer").unwrap(), "src", &abuffer_args)?;
-    filter.add(&filter::find("abuffersink").unwrap(), "sink", "")?;
-    filter
-        .output("src", 0)?
-        .input("sink", 0)?
-        .parse(&filter_spec)?;
-    filter.validate()?;
-
-    let mut source_ctx = filter.get("src").unwrap();
-    let mut sink_ctx = filter.get("sink").unwrap();
-
-    // non-blocking, so that polling for transcript lines never stalls decoding
-    let (tcp_stream, _) = tcp_listener.accept()?;
-    tcp_stream.set_nonblocking(true)?;
-
-    let mut transcript_reader = BufReader::new(tcp_stream);
-    let mut line_buf = String::new();
-
-    while let Ok(packet) = packet_rx.recv() {
-        let handled = handle_packet(
-            stream_ix,
-            &sender,
-            &mut decoder,
-            source_ctx.source(),
-            sink_ctx.sink(),
-            &mut transcript_reader,
-            &mut line_buf,
-            packet,
-        );
-
-        if let Err(e) = handled {
-            log::error!("error handling audio packet: {}", e);
-        }
-    }
-
-    Ok(())
-}
-
-/// Feeds one audio packet into the transcription pipeline and publishes every
-/// transcript line that is available afterwards.
-///
-/// The packet is decoded, the decoded frames are pushed into `source`, and
-/// `sink` is drained. Then all complete lines that can be read from
-/// `transcript_reader` without blocking are parsed as `WhisperCue`s, appended
-/// to the `stream_ix` entry of `SUBTITLE_TRACKS`, and forwarded through
-/// `sender`.
-///
-/// `line_buf` must be the same buffer on every call: it carries a partially
-/// received line over to the next call.
-///
-/// # Errors
-///
-/// Fails on decoder or filter errors, on socket errors other than
-/// `WouldBlock`, and when a transcript line is not valid JSON for a
-/// `WhisperCue`.
-///
-/// # Panics
-///
-/// Panics if `stream_ix` is missing from `SUBTITLE_TRACKS` or if forwarding a
-/// cue through `sender` fails.
-// TODO: can we do this without passing all the arguments? this is kinda ugly
-fn handle_packet(
-    stream_ix: StreamIndex,
-    sender: &ComponentSender<SubtitleExtractor>,
-    decoder: &mut ffmpeg::decoder::Audio,
-    mut source: filter::Source,
-    mut sink: filter::Sink,
-    transcript_reader: &mut BufReader<TcpStream>,
-    line_buf: &mut String,
-    packet: ffmpeg::Packet,
-) -> anyhow::Result<()> {
-    decoder.send_packet(&packet)?;
-
-    let mut decoded = frame::Audio::empty();
-    while decoder.receive_frame(&mut decoded).is_ok() {
-        source.add(&decoded)?;
-    }
-
-    // the filtered audio itself is not needed, only the transcript side channel
-    let mut out_frame = frame::Audio::empty();
-    while sink.frame(&mut out_frame).is_ok() {}
-
-    loop {
-        // `line_buf` is deliberately not cleared before reading: after a
-        // `WouldBlock` it may hold the first part of a line whose remainder
-        // only arrives with a later packet.
-        match transcript_reader.read_line(line_buf) {
-            // the peer closed the connection, nothing more to read
-            Ok(0) => return Ok(()),
-            Ok(_) => {
-                let parsed = serde_json::from_str::<WhisperCue>(line_buf.as_str());
-                // the line is consumed whether or not it could be parsed
-                line_buf.clear();
-                let whisper_cue = parsed?;
-
-                let cue = SubtitleCue {
-                    start: gst::ClockTime::from_mseconds(whisper_cue.start),
-                    end: gst::ClockTime::from_mseconds(whisper_cue.end),
-                    text: whisper_cue.text,
-                };
-
-                // TODO deduplicate this vs. the code in embedded.rs
-                SUBTITLE_TRACKS
-                    .write()
-                    .get_mut(&stream_ix)
-                    .unwrap()
-                    .cues
-                    .push(cue.clone());
-                sender
-                    .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
-                    .unwrap();
-            }
-            // no complete line available yet, try again with the next packet
-            Err(e) if e.kind() == io::ErrorKind::WouldBlock => return Ok(()),
-            Err(e) => return Err(e.into()),
-        }
-    }
-}