Spectrogram Generation with GStreamer and TorchAudio¶
It's been nearly a year since I last posted a project. So much has happened in the world of AI and machine learning, and I've been absolutely consumed with learning and experimenting with these new technologies.
Specifically, I've been working on processing high frequency audio data. This data, captured by underwater hydrophones, extends to frequencies beyond the range of human hearing. Marine mammals like dolphins and porpoises use these high frequency sounds for echolocation and communication.
Standard audio models like Wav2Vec2 are not designed to handle these high frequency sounds: they are trained on human speech recordings sampled at 16kHz. To effectively process these high frequency sounds, and get a model to actually understand them, we can turn to generating spectrograms.
Spectrograms are visual representations of audio signals. They allow us to visualize how frequencies change over time, even when those frequencies are beyond human hearing. Using the Fast Fourier Transform (FFT), frequency is encoded on the y-axis, time on the x-axis, and amplitude is represented by color intensity.
In essence, a single spectrogram image contains temporal information. Because it is a static image, we can use it to train classical convolutional neural networks (CNNs) to classify and analyze these high frequency sounds. We could also combine CNNs with LSTMs to capture both spatial and temporal features from the spectrograms, or even use transformer architectures to capture long-range dependencies in the spectrogram data.
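To make that concrete before touching the live stream, here's a minimal sketch (a synthetic chirp rather than real hydrophone audio) of turning a waveform into exactly that kind of image with torchaudio, which we'll lean on later:
import torch
import torchaudio
sample_rate = 48000  # Hz, the same rate we'll see from the hydrophone stream later
duration = 2.0       # seconds of synthetic audio
# synthetic chirp sweeping from 1 kHz up to 20 kHz over the clip
freqs = torch.linspace(1_000, 20_000, int(sample_rate * duration))
phase = 2 * torch.pi * torch.cumsum(freqs, dim=0) / sample_rate
waveform = torch.sin(phase)
# STFT-based spectrogram: rows are frequency bins, columns are time frames
spec = torchaudio.transforms.Spectrogram(n_fft=1024, hop_length=512, power=2.0)(waveform)
spec_db = torchaudio.transforms.AmplitudeToDB("power")(spec)
# (n_fft // 2 + 1) frequency bins tall, one column per 512-sample hop
print(spec_db.shape)  # torch.Size([513, 188])
Each of the modeling approaches above would take an image like this (or a batch of them) as input.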
All of these approaches are really cool, but what if we have a live audio stream? Wouldn't it be cool if we could generate a live stream (both audio and video) that shows the spectrogram in real-time as the audio is being captured? Would this help computer vision models have an easier time classifying the audio data?
Sometimes, just sometimes, we have to do a little bit of classical software engineering to make things happen.
The Source¶
There is a really cool project called OrcaSound that captures live audio from hydrophones placed in the ocean. They have a live stream that you can listen to here. Go check it out, seriously, it's super cool.
import cv2
import matplotlib.pyplot as plt
image = cv2.imread("orcasound.png")
plt.figure(figsize=(10, 4))
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()
In fact, they actually have a short talk about the project and the machine learning they have already done with the data here.
Like a little kid playing in the browser, I inspected the network traffic and found the actual HLS playlist URL. I can then probe it with ffprobe to see the audio format.
! ffprobe -v quiet -print_format json -show_format -show_streams https://audio-orcasound-net.s3.amazonaws.com/rpi_orcasound_lab/hls/1763452821/live.m3u8 | jq '{ codec: .streams[0].codec_name, sample_rate: .streams[0].sample_rate, channels: .streams[0].channels }'
{ "codec": "aac", "sample_rate": "48000", "channels": 2 }
oohhhh so cool. Now we are cooking! From the ffprobe output, I can see that the audio stream is encoded as AAC with a sample rate of 48kHz and 2 channels (stereo). AAC is a really awesome lossy audio codec, supported out of the box by ffmpeg, that delivers good quality audio at low bitrates while still covering a wide frequency range! This is critical for capturing the range of frequencies used by marine mammals in their vocalizations. As stated before, normal human speech models are not designed to handle these high frequency sounds, so having a codec that preserves these frequencies is essential for effective analysis and classification of the audio data.
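As a quick sanity check on why that 48kHz matters: the highest frequency a sampled signal can represent is half its sample rate (the Nyquist limit), so this stream carries three times the bandwidth of a 16kHz speech corpus:
speech_model_rate = 16_000  # Hz, the rate Wav2Vec2-style speech models are trained at
hydrophone_rate = 48_000    # Hz, what ffprobe reported for the OrcaSound stream
# Nyquist: usable bandwidth is half the sample rate
print(speech_model_rate // 2)  # 8000 Hz ceiling for speech models
print(hydrophone_rate // 2)    # 24000 Hz ceiling for this stream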
GStreamer¶
GStreamer is so cool. I have fallen in love with it. For some time, I actually really didn't like it, but that was before I understood how to use it effectively. And with bindings in Python (and Rust, my new favorite language), it makes it super easy to build complex multimedia pipelines.
What we essentially need to do is build a live streaming pipeline that captures the audio from the HLS stream, decodes it, and hands the audio samples to Python, where we generate spectrograms at a fixed interval and then encode the spectrogram images into a video stream that can be output somewhere! That somewhere could be an MP4 file, an RTSP stream, or even a local display window. For now, we will output to a video file so I can embed it in this notebook.
Let's start with utilizing the gst-launch-1.0 command line tool to prototype our pipeline.
gst-launch-1.0 -v souphttpsrc location="https://audio-orcasound-net.s3.amazonaws.com/rpi_orcasound_lab/hls/1763452821/live.m3u8" is-live=1 ! hlsdemux ! queue ! decodebin ! audioconvert ! autoaudiosink
Running this in the command line, indeed, we do hear the live audio stream from the OrcaSound hydrophone! Success! Now let's start scripting this in Python so we can capture the audio samples and generate spectrograms. I've added comments to the code to explain what each part does.
import numpy as np
import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst, GLib
Gst.init(None)
hls_source = "https://audio-orcasound-net.s3.amazonaws.com/rpi_orcasound_lab/hls/1763452821/live.m3u8"
source_pipeline = f'''
souphttpsrc location="{hls_source}" is-live=true \
! hlsdemux \
! queue \
! decodebin \
! audioconvert \
! audio/x-raw,channels=2 \
! deinterleave name=d \
d.src_0 ! \
audiobuffersplit output-buffer-duration=1/15 ! \
appsink name=read emit-signals=true
'''.strip()
loop = GLib.MainLoop.new(None, False)
source_pipeline = Gst.parse_launch(source_pipeline)
audio_source = source_pipeline.get_by_name("read")
def on_audio_sample(sink):
sample = sink.emit("pull-sample")
audio_buffer = sample.get_buffer()
caps = sample.get_caps()
success, map_info = audio_buffer.map(Gst.MapFlags.READ)
if not success:
return Gst.FlowReturn.ERROR
data = map_info.data
# tensor is a 1D numpy array of float32 audio samples
tensor = np.frombuffer(data, dtype=np.float32)
# TODO: Process the audio data (e.g., generate spectrogram)
print(f"caps: {caps.to_string()}")
print(f"tensor: {tensor.shape} {tensor.dtype} {tensor.min()} {tensor.max()}")
audio_buffer.unmap(map_info)
return Gst.FlowReturn.OK
def on_source_message(bus, message):
if message.type == Gst.MessageType.EOS:
print("End-Of-Stream reached.")
loop.quit()
elif message.type == Gst.MessageType.ERROR:
print(f"GStreamer Source Error: {message.parse_error()}")
loop.quit()
elif message.type == Gst.MessageType.WARNING:
print(f"GStreamer Source Warning: {message.parse_warning()}")
# setup a bus to be able to catch errors and gstreamer messages.
# this will come in handy later for debugging and also triggering
# end-of-stream events.
source_bus = source_pipeline.get_bus()
source_bus.add_signal_watch()
source_bus.connect("message", on_source_message)
# this is where the audio data will be received and we can handle it.
audio_source.connect("new-sample", lambda sink: on_audio_sample(sink))
# it's a live source, so set the pipeline to playing right away
source_pipeline.set_state(Gst.State.PLAYING)
try:
loop.run()
except KeyboardInterrupt:
print("Interrupted by user, stopping...")
except Exception:
print("Failed during processing")
finally:
source_pipeline.set_state(Gst.State.NULL)
Interrupted by user, stopping...
Now I couldn't get this to run in a notebook unfortunately. But that's okay. The output was a little something like this:
caps: audio/x-raw, format=(string)F32LE, layout=(string)interleaved, rate=(int)48000, channels=(int)1
tensor: (3200,) float32 -0.009153034538030624 0.008057080209255219
A few very important notes on the GStreamer application so far. For starters, we need to understand the caps information a bit more. The caps tell us that the audio format is F32LE, which means 32-bit floating point, little-endian. This is great because it means we can directly convert the audio samples into a numpy array of float32 values. The caps also show a single channel, even though ffprobe told us the original stream is stereo (2 channels): that's because we added the deinterleave element to the pipeline to capture only one channel. I'm not exactly sure why the hydrophone would be in stereo, unless it's simply to give the listener a sense of spatial audio, so for now I'm assuming we can work with a single channel (mono) audio stream for spectrogram generation.
The second important note is the shape of the tensor. souphttpsrc and hlsdemux stream the audio data in chunks whose size is determined by the internal buffering of the GStreamer elements, and that doesn't necessarily correspond to the framerate we want for the resulting video. This is where the audiobuffersplit element comes in: it re-chunks the buffers into fixed durations that we can match to a given framerate. With output-buffer-duration set to 1/15, each buffer covers 1/15 of a second, which gives us 15 frames per second in the resulting video.
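As a sanity check on the numbers, the 3200-sample tensors we saw in the output fall straight out of that setting:
sample_rate = 48000              # Hz, from the caps
output_buffer_duration = 1 / 15  # seconds, as configured on audiobuffersplit
samples_per_buffer = int(sample_rate * output_buffer_duration)
print(samples_per_buffer)  # 3200, matching the (3200,) tensors printed above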
So now we have a live audio stream being captured in Python as numpy arrays of float32 audio samples at a fixed interval. Now we can start to generate the spectrograms. Generating the spectrograms, though, is going to be a bit tricky. A few things we need to consider:
- The sample rate of the audio stream is 48kHz. This means that we need to set the FFT parameters accordingly to capture enough resolution to see the high frequency sounds.
- The length of the audio captured in each chunk determines how much of the spectrogram we can draw at once. If we want to capture more temporal context, we need a way to buffer the spectrogram images over time.
- We need to convert the spectrogram into a format that can be encoded into a video stream. This means normalizing the data and converting it into an 8-bit image format.
- We need to consider the color mapping of the spectrogram. Typically, spectrograms are visualized using a colormap that maps amplitude values to colors. For simplicity, we can start with a grayscale representation where higher amplitudes are lighter and lower amplitudes are darker.
- For a standardized output, we should resize the spectrogram image to some standard resolution, like 640x480 pixels. But to maintain the aspect ratio to avoid distortion, we need to resize it properly and then crop it or pad it to fit the desired dimensions.
With these things considered, here is a bit of code to achieve this:
import torch as t
import torchaudio as ta
spec_transform = ta.transforms.Spectrogram(
n_fft=1024,
win_length=1024,
hop_length=512,
normalized=True,
power=2.0
)
power_to_db = ta.transforms.AmplitudeToDB("power", 150.0)
# inside the on_audio_sample function after converting to numpy array
tensor = t.from_numpy(tensor).float()
tensor = power_to_db(spec_transform(tensor))
# we shift and scale the dB values into the range 0-255 for image encoding
tensor += 100.0
tensor = (tensor / 150.0) * 255.0
# clamp before the uint8 cast so out-of-range values don't wrap around
tensor = tensor.clamp(0.0, 255.0)
tensor = tensor.numpy().astype(np.uint8)
# flip the spectrogram vertically so low frequencies are at the bottom
tensor = np.flipud(tensor)
I'm utilizing torchaudio to generate the spectrograms. The hyperparameters were chosen somewhat arbitrarily; we can adjust them later if needed. However, this only gets us a sliver of the total spectrogram that we want to generate. We need to decide how long to buffer the spectrogram for, set that as a hyperparameter, and then calculate the size of the resulting spectrogram image.
spectrogram_duration = 5.0 # seconds
sample_rate = 48000 # Hz
n_fft = 1024
hop_length = 512
spectrogram_image_height = n_fft // 2 + 1
spectrogram_image_width = int((spectrogram_duration * sample_rate) / hop_length)
spectrogram_image = np.zeros((spectrogram_image_height, spectrogram_image_width), dtype=np.uint8)
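Plugging the numbers in, that buffer comes out to 513 frequency bins (DC up through the Nyquist bin) by 468 hop-sized columns across the 5 second window:
# 1024 // 2 + 1 = 513 bins, int(5.0 * 48000 / 512) = 468 columns
assert spectrogram_image_height == 513
assert spectrogram_image_width == 468
print(spectrogram_image.shape)  # (513, 468)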
This definitely assumes we know the sample_rate ahead of time! If we want to be more dynamic (which we can), we can add a probe to the appsink's pad to capture the caps event and extract the sample rate from there.
def on_audio_probe(pad, info, user_data):
event = info.get_event()
if event.type == Gst.EventType.CAPS:
caps = event.parse_caps()
print(f"Audio caps: {caps.to_string()}")
# once we have the caps, we can remove this probe.
return Gst.PadProbeReturn.REMOVE
return Gst.PadProbeReturn.PASS
audio_source\
.get_static_pad("sink")\
.add_probe(Gst.PadProbeType.EVENT_DOWNSTREAM, on_audio_probe, None)
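The probe above only prints the caps. To actually pull the sample rate out and size the rolling buffer from it, something along these lines should work; this is a sketch I haven't folded back into the full script:
def on_audio_probe(pad, info, user_data):
    event = info.get_event()
    if event.type == Gst.EventType.CAPS:
        # caps hold a list of structures; the first one carries the audio fields
        structure = event.parse_caps().get_structure(0)
        ok, rate = structure.get_int("rate")
        if ok:
            # recompute the rolling buffer width from the real sample rate
            width = int(spectrogram_duration * rate / hop_length)
            print(f"sample rate {rate} Hz -> spectrogram width {width}")
        return Gst.PadProbeReturn.REMOVE
    return Gst.PadProbeReturn.PASS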
Now that we have our buffer, we can start filling it in with the spectrogram slices we generate from each audio chunk.
# inside the on_audio_sample function after generating the spectrogram slice
# assuming spectrogram_image is a global variable
spectrogram_image[:, :-tensor.shape[1]] = spectrogram_image[:, tensor.shape[1]:]
spectrogram_image[:, -tensor.shape[1]:] = tensor
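For a sense of scale (assuming torchaudio's default centered framing), each 1/15 second chunk contributes only a handful of columns, so the image scrolls by a few pixels per video frame:
samples_per_chunk = 3200
hop_length = 512
# the Spectrogram transform pads and centers frames by default,
# giving floor(samples / hop) + 1 columns per chunk
columns_per_chunk = samples_per_chunk // hop_length + 1
print(columns_per_chunk)  # 7 -> the 468-column image scrolls ~7 px per frame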
And boom! We are now generating a live spectrogram image from the audio stream. We can write the spectrogram image to a file with OpenCV for debugging...
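The debug write itself is a one-liner (spectrogram_image being the uint8 buffer we've been filling):
import cv2
# dump the current state of the rolling spectrogram buffer to disk
cv2.imwrite("spectrogram_test.png", spectrogram_image)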
import cv2
import matplotlib.pyplot as plt
image = cv2.imread("spectrogram_test.png")
plt.figure(figsize=(10, 4))
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()
The artifacts at the top are compression artifacts from the AAC codec. Exploring different codecs, like Opus and MP3, I've seen different artifacts appear. Unfortunately, this is the reality of working with lossy audio codecs, but for now it's good enough for prototyping. What we'd really like is a stream with its audio encoded in FLAC, which is a lossless codec.
Now we need to write this data to another pipeline and encode it as a video stream! But we also want to write the audio buffers as well so that we have both audio and video in the resulting output.
# flacenc does not accept F32LE, so audioconvert turns it into S16LE and we set the caps on the audio branch accordingly
sink_pipeline = f'''
appsrc name=video emit-signals=true format=time is-live=true \
appsrc name=audio emit-signals=true format=time is-live=true \
matroskamux name=mux ! filesink location="output.mkv" \
video. \
! queue \
! videoconvert \
! video/x-raw,format=I420 \
! x264enc tune=zerolatency speed-preset=ultrafast \
! h264parse config-interval=-1 ! mux. \
audio. \
! queue \
! audioconvert \
! audio/x-raw,format=S16LE \
! flacenc \
! mux.
'''.strip()
sink_pipeline = Gst.parse_launch(sink_pipeline)
video_sink = sink_pipeline.get_by_name("video")
audio_sink = sink_pipeline.get_by_name("audio")
# it's very important that we set the caps for the video and audio sink correctly
# In this example, we know exactly what the video format will be so we can set it ahead of time
# But in a more dynamic application, we will need to utilize a probe to set the caps on the fly
video_caps = Gst.Caps.from_string("video/x-raw,format=GRAY8,width=640,height=480,framerate=15/1")
audio_caps = Gst.Caps.from_string("audio/x-raw,format=F32LE,layout=interleaved,rate=48000,channels=2")
video_sink.set_property('caps', video_caps)
audio_sink.set_property('caps', audio_caps)
####
# in the on_audio_sample function, after generating the spectrogram image
###
scale_w = 640 / spectrogram_image_width
scale_h = 480 / spectrogram_image_height
scale = min(scale_w, scale_h)
video_width = int(spectrogram_image_width * scale)
video_height = int(spectrogram_image_height * scale)
resized_spec = cv2.resize(spectrogram_image, (video_width, video_height))
# place resized_spec in the center of a blank 640x480 frame
video_frame = np.zeros((480, 640), dtype=np.uint8)
y_offset = (480 - video_height) // 2
x_offset = (640 - video_width) // 2
video_frame[y_offset:y_offset+video_height, x_offset:x_offset+video_width] = resized_spec
# we utilize a global pts variable to keep track of the presentation timestamp
# for writing to the sinks
frame_buffer = Gst.Buffer.new_wrapped(video_frame.tobytes())
frame_buffer.pts = global_pts
# audio_buffer is the incoming buffer pulled from the appsink, so its
# duration (1/15 of a second) is the same as the video frame's
frame_buffer.duration = audio_buffer.duration
# copy the mapped audio data so it stays valid after the source buffer is unmapped
out_audio_buffer = Gst.Buffer.new_wrapped(bytes(map_info.data))
out_audio_buffer.pts = global_pts
out_audio_buffer.duration = audio_buffer.duration
# update the global pts
global_pts += frame_buffer.duration
video_sink.emit("push-buffer", frame_buffer)
audio_sink.emit("push-buffer", out_audio_buffer)
Now this isn't the full code, but hopefully you get the general idea of how to achieve this. But boy this code is so much fun to write. After running this and watching the resulting video file, we can actually visualize the spectrogram in real-time, alongside the audio stream!
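One detail not shown above that matters for getting a playable file: matroskamux only finalizes the MKV on end-of-stream, so on shutdown it's worth signaling EOS on both appsrc elements and letting it propagate through the sink pipeline before setting it to NULL. Roughly:
# signal end-of-stream on both appsrc elements so matroskamux can
# write its final headers and index before we tear the pipeline down
video_sink.emit("end-of-stream")
audio_sink.emit("end-of-stream")
# wait (up to 5 seconds) for the EOS to reach the end of the sink pipeline
sink_bus = sink_pipeline.get_bus()
sink_bus.timed_pop_filtered(5 * Gst.SECOND, Gst.MessageType.EOS | Gst.MessageType.ERROR)
sink_pipeline.set_state(Gst.State.NULL)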
I converted the resulting MKV file into an MP4 file so I can embed it here:
from IPython.display import Video
Video('https://jackmead515.github.io/videos/spectrovid.mp4', html_attributes='loop autoplay muted playsinline width="100%"')
I'm so giddy when things like this actually work. So cool. The video embedded here is actually running at 60 FPS and looks so buttery smooth. Now, this is just writing to a regular ole MKV file. But with some very easy adjustments, we could stream this out to an RTSP server via the rtspclientsink GStreamer element.
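To give a flavor of what those adjustments might look like: the muxer and filesink get swapped for an rtspclientsink element, which takes the encoded streams and publishes them to an existing RTSP server. This is an untested sketch, the URL is a placeholder, and I've swapped flacenc for opusenc since Opus is much friendlier to RTP payloading:
# untested sketch: publish to an RTSP server instead of muxing to a file.
# rtsp://localhost:8554/spectrogram is a placeholder for wherever an
# RTSP server (e.g. MediaMTX) is listening.
rtsp_sink_pipeline = '''
appsrc name=video emit-signals=true format=time is-live=true \
appsrc name=audio emit-signals=true format=time is-live=true \
rtspclientsink name=rtsp location="rtsp://localhost:8554/spectrogram" \
video. \
! queue \
! videoconvert \
! video/x-raw,format=I420 \
! x264enc tune=zerolatency speed-preset=ultrafast \
! h264parse config-interval=-1 ! rtsp. \
audio. \
! queue \
! audioconvert \
! audio/x-raw,format=S16LE \
! opusenc \
! rtsp.
'''.strip()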
Perhaps in a not so distant future, I will do just that...
For now, I'm so happy to have a working live spectrogram generator from a live audio stream. For the full code, check out the link here: live spectrogram generator.
Thanks for reading along! I know I haven't posted in a while, but I'm excited to write all the new things I've been learning this year.
And thank you to the OrcaSound project for providing open access to these hydrophone audio streams. Such a cool project!