I have an hour-long audio recording that is streamed to the backend over WebSockets. I need to start transcribing the audio while it is still arriving and return results in near real time, or at least spare users from waiting a full hour before they can see any transcription.
I have broken the problem down into the following steps:
- Record the audio stream
- Split the stream into 30-second chunks and upload them
- Convert the audio chunks to separate files and transcribe each one
- Send back the transcribed text
The problem I face is that when I try to play the second chunk of audio, I get an "Invalid data found when processing input"
error. The chunk is unplayable and doesn't seem to have any audio information associated with it.
Is it possible to add the audio information to the second chunk, given that the first chunk plays fine?
Frontend
async startRecording() {
this.isRecording = true;
// Start WebSocket connection
this.websocket = new WebSocket("URL");
this.websocket.onopen = () => console.log("WebSocket connected");
this.websocket.onclose = () => console.log("WebSocket disconnected");
this.websocket.onstop = () => console.log("WebSocket onstop");
this.websocket.onerror = (error) => {
console.error("WebSocket error:", error);
this.stopRecording ();
};
this.websocket.onmessage = (event) => {
console.log(event.data)
};
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
this.mediaStream = stream
this.mediaRecorder = new MediaRecorder(stream, { mimeType: "audio/webm" });
// Send audio chunks in real-time
this.mediaRecorder.ondataavailable = (event) => {
if (this.websocket && this.websocket.readyState === WebSocket.OPEN) {
this.websocket.send(event.data);
}
};
this.mediaRecorder.start(30000); // Capture chunks every 30000ms
},
Backend
async def receive(self, text_data=None, bytes_data=None):
    """Persist one binary WebSocket frame as its own ``.webm`` file.

    Args:
        text_data: Text frame payload; not used by this handler.
        bytes_data: Binary frame payload (one recorded audio blob). Empty or
            missing payloads are skipped.

    Side effects:
        Creates the ``audio/`` directory if needed, writes the payload to a
        new file there, and stores that file's path in
        ``self.audio_file_path``.
    """
    if not bytes_data:
        return

    # Function-scope import: the file's top-level import block is not visible
    # from this block.
    import uuid

    # A second-resolution timestamp is not unique: two frames arriving within
    # the same second (fast sends, or several connections at once) would
    # overwrite each other. Add microseconds plus a random suffix.
    stamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    audio_filename = f"audio_{stamp}_{uuid.uuid4().hex}.webm"
    self.audio_file_path = os.path.join("audio/", audio_filename)
    os.makedirs(os.path.dirname(self.audio_file_path), exist_ok=True)

    # NOTE(review): a frame is only decodable on its own if it is a complete
    # WebM file (it begins with the EBML magic bytes 1A 45 DF A3). Frames cut
    # from a single continuous recording lack that header after the first
    # one, so the sender must emit complete files (e.g. one MediaRecorder
    # start/stop cycle per frame) for per-frame decoding to work.
    # `with` guarantees the handle is closed even if the write raises.
    with open(self.audio_file_path, "wb") as self.audio_file:
        self.audio_file.write(bytes_data)
My assumption is that the second chunk would work if we could add the missing audio information to it. ffmpeg, transcribe, and pydub all fail to read the file, with the same error: "Invalid data found when processing input".
pydub conversion
# Load the chunk that was just written to disk. No format is passed, so
# detection is left to pydub (presumably via ffmpeg; confirm). This is the call
# that reports "Invalid data found when processing input" for the second chunk.
audio = AudioSegment.from_file(self.audio_file_path)
# Write the loaded audio back out as MP3 at mp3_file_path (defined outside
# this fragment).
audio.export(mp3_file_path, format="mp3")