Skip to main content

Speech Recognition

Given speech data recorded from a microphone, Speech Recognition returns the corresponding transcript of the spoken text. The steps below show how to get the English version of DeepSpeech up and running.

Install

sudo apt install portaudio19-dev
pip3 install pyaudio
pip3 install deepspeech-gpu
  • Model:
curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer

Usage

main.py
#!/usr/bin/env python3
import time

import deepspeech
import numpy as np
import pyaudio

# Load the acoustic model (the .pbmm file fetched in the install step).
model_file = './deepspeech-0.9.3-models.pbmm'
model = deepspeech.Model(model_file)
model.setBeamWidth(500)

# Attach the external scorer BEFORE tuning it: setScorerAlphaBeta() operates on
# the scorer that is currently enabled, so calling it first (as this script
# originally did) leaves the alpha/beta values unapplied.
model.enableExternalScorer('./deepspeech-0.9.3-models.scorer')
model.setScorerAlphaBeta(0.75, 1.85)

# One streaming decode state; audio is fed into it chunk by chunk and the
# final transcript is read from it when recording stops.
stream = model.createStream()

# Last transcript that was printed; lets the callback skip unchanged results.
prev_text = ''


def process_audio(in_data, frame_count, time_info, status):
    """Feed one captured audio buffer to the decoder and print new text.

    Registered as the ``stream_callback`` of the PyAudio input stream.

    Args:
        in_data: Raw bytes of the captured buffer, read as 16-bit integers.
        frame_count: Unused.
        time_info: Unused.
        status: Unused.

    Returns:
        A ``(in_data, pyaudio.paContinue)`` tuple so that capture keeps going.
    """
    global prev_text

    samples = np.frombuffer(in_data, dtype=np.int16)
    stream.feedAudioContent(samples)

    transcript = stream.intermediateDecode()
    if transcript != prev_text:
        # Only report when the intermediate result actually changed.
        message = '\rTranscript = {}'.format(transcript)
        print(message)
        prev_text = transcript

    return in_data, pyaudio.paContinue

audio = pyaudio.PyAudio()

# Open a mono, 16-bit input stream whose buffers are delivered to
# process_audio() (which reads them as np.int16 samples).
# The capture rate is taken from the loaded model instead of being hard-coded,
# so the audio always matches what the model expects (16000 Hz for the
# 0.9.3 English model used here).
audio_stream = audio.open(
    format=pyaudio.paInt16,
    channels=1,
    rate=model.sampleRate(),
    input=True,
    frames_per_buffer=1024,
    stream_callback=process_audio,
)

# Keep the main thread alive while the callback does the work; Ctrl+C ends
# the recording. The audio teardown sits in ``finally`` so the stream and the
# PortAudio session are released even if something other than
# KeyboardInterrupt is raised while waiting.
try:
    while audio_stream.is_active():
        time.sleep(0.1)
except KeyboardInterrupt:
    pass
finally:
    audio_stream.stop_stream()
    audio_stream.close()
    audio.terminate()

# Finish the decode only after capture has stopped, then print the result.
text = stream.finishStream()
print('Final text = {}'.format(text))

Source