Added DeepSpeech RecognitionService

This commit is contained in:
Ciaran O'Reilly 2020-11-20 22:25:38 +01:00
parent ea2d551bed
commit 920f3468b2
13 changed files with 592 additions and 10 deletions

4
.gitignore vendored
View File

@ -8,4 +8,6 @@ build
.project .project
# JDT-specific (Eclipse Java Development Tools) # JDT-specific (Eclipse Java Development Tools)
.classpath .classpath
.vscode

31
app/assets.xml Normal file
View File

@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Ant build imported into Gradle via ant.importBuild (see build.gradle):
     produces the asset manifest (assets.lst) and per-file md5 checksums
     under src/main/assets/sync that the asset-sync code reads at runtime. -->
<project name="assets">
    <!-- Name of the generated manifest listing every synced asset. -->
    <property name="assets.list.name" value="assets.lst"/>
    <!-- Directory whose contents are synced to device storage. -->
    <property name="assets.dir" value="src/main/assets/sync"/>
    <!-- Hash algorithm; also used as the checksum file extension. -->
    <property name="assets.hash.type" value="md5"/>
    <!-- Generated control files (checksums + manifest), excluded from the
         asset fileset so they are not listed/hashed themselves. -->
    <property name="assets.ctl.files"
              value="**/*.${assets.hash.type},${assets.list.name}"/>

    <fileset id="assets" dir="${assets.dir}" excludes="${assets.ctl.files}"/>

    <!-- Removes generated checksums and the manifest (wired to Gradle clean). -->
    <target name="clean_assets">
        <delete>
            <fileset dir="${assets.dir}" includes="${assets.ctl.files}"/>
        </delete>
    </target>

    <!-- Writes one relative path per line into ${assets.dir}/${assets.list.name}. -->
    <target name="list">
        <pathconvert
            dirsep="/" pathsep="${line.separator}"
            refid="assets" property="asset.list">
            <!-- Strip the absolute prefix so paths are relative to assets.dir. -->
            <map from="${basedir}/${assets.dir}/" to=""/>
        </pathconvert>
        <echo message="${asset.list}" file="${assets.dir}/${assets.list.name}"/>
    </target>

    <!-- Emits a sibling .md5 file next to every asset. -->
    <target name="checksum">
        <checksum algorithm="${assets.hash.type}">
            <fileset refid="assets"/>
        </checksum>
    </target>
</project>

View File

@ -5,6 +5,9 @@ repositories {
maven { maven {
url "https://dl.bintray.com/alphacep/vosk" url "https://dl.bintray.com/alphacep/vosk"
} }
maven {
url "https://jitpack.io"
}
} }
android { android {
@ -31,4 +34,10 @@ dependencies {
implementation 'com.alphacep:vosk-android:0.3.15' implementation 'com.alphacep:vosk-android:0.3.15'
implementation 'androidx.appcompat:appcompat:1.2.0' implementation 'androidx.appcompat:appcompat:1.2.0'
implementation 'com.google.code.gson:gson:2.8.6' implementation 'com.google.code.gson:gson:2.8.6'
} implementation 'org.mozilla.deepspeech:libdeepspeech:0.8.2'
implementation 'com.github.gkonovalov:android-vad:1.0.0'
}
ant.importBuild 'assets.xml'
preBuild.dependsOn(list, checksum)
clean.dependsOn(clean_assets)

View File

@ -27,5 +27,23 @@
android:name="android.speech" android:name="android.speech"
android:resource="@xml/recognition_service" /> android:resource="@xml/recognition_service" />
</service> </service>
<service
android:name=".DeepSpeechRecognitionService"
android:icon="@drawable/ic_service_trigger"
android:label="@string/deepspeech_recognition_service"
android:permission="android.permission.RECORD_AUDIO">
<intent-filter>
<!-- The constant value is defined at RecognitionService.SERVICE_INTERFACE. -->
<action android:name="android.speech.RecognitionService" />
<category android:name="android.intent.category.DEFAULT" />
</intent-filter>
<meta-data
android:name="android.speech"
android:resource="@xml/recognition_service" />
</service>
</application> </application>
</manifest> </manifest>

View File

@ -0,0 +1,26 @@
deepspeech-catala/kenlm.scorer
deepspeech-catala/model.tflite
vosk-catala/README
vosk-catala/am/final.mdl
vosk-catala/am/tree
vosk-catala/conf/mfcc.conf
vosk-catala/conf/model.conf
vosk-catala/graph/Gr.fst
vosk-catala/graph/HCLr.fst
vosk-catala/graph/disambig_tid.int
vosk-catala/graph/phones/align_lexicon.int
vosk-catala/graph/phones/align_lexicon.txt
vosk-catala/graph/phones/disambig.int
vosk-catala/graph/phones/disambig.txt
vosk-catala/graph/phones/optional_silence.csl
vosk-catala/graph/phones/optional_silence.int
vosk-catala/graph/phones/optional_silence.txt
vosk-catala/graph/phones/silence.csl
vosk-catala/graph/phones/word_boundary.int
vosk-catala/graph/phones/word_boundary.txt
vosk-catala/ivector/final.dubm
vosk-catala/ivector/final.ie
vosk-catala/ivector/final.mat
vosk-catala/ivector/global_cmvn.stats
vosk-catala/ivector/online_cmvn.conf
vosk-catala/ivector/splice.conf

View File

@ -0,0 +1 @@
d562825f02f2ba36cbd0a75a17e84e8d

View File

@ -0,0 +1 @@
735b1327dc3c00af256af64be33cbed3

View File

@ -0,0 +1 @@
f49442fa8c9e15bfbb6379c788b3104f

View File

@ -0,0 +1,176 @@
package cat.oreilly.localstt;
import android.content.Intent;
import android.os.AsyncTask;
import android.os.Bundle;
import android.os.RemoteException;
import android.speech.RecognitionService;
import android.util.Log;
import android.media.AudioFormat;
import android.media.AudioRecord;
import android.media.MediaRecorder;
import org.kaldi.Assets;
import org.kaldi.RecognitionListener;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
import java.io.File;
import java.util.Map;
import java.util.ArrayList;
import java.io.IOException;
/**
 * Android {@link RecognitionService} backed by a local Mozilla DeepSpeech
 * (TFLite) model with an external KenLM scorer. Assets are synced to local
 * storage on first use, then a {@link DeepSpeechService} captures microphone
 * audio and this class forwards partial/final hypotheses to the bound client.
 */
public class DeepSpeechRecognitionService extends RecognitionService implements RecognitionListener {
    private final static String TAG = DeepSpeechRecognitionService.class.getSimpleName();

    // Native DeepSpeech model; loaded once in runRecognizerSetup().
    private DeepSpeechModel model;
    // Microphone capture / decode loop; null until setupRecognizer() succeeds.
    private DeepSpeechService speechService;
    // Callback of the current client; refreshed on every onStartListening().
    private RecognitionService.Callback mCallback;

    @Override
    protected void onStartListening(Intent intent, Callback callback) {
        mCallback = callback;
        Log.i(TAG, "onStartListening");
        runRecognizerSetup(intent);
    }

    @Override
    protected void onCancel(Callback callback) {
        Log.i(TAG, "onCancel");
        // Deliver an empty final result so the client stops waiting.
        results(new Bundle(), true);
    }

    @Override
    protected void onStopListening(Callback callback) {
        Log.i(TAG, "onStopListening");
        results(new Bundle(), true);
    }

    /**
     * Syncs model assets and loads the DeepSpeech model off the main thread,
     * then starts listening. On failure the client receives ERROR_CLIENT.
     */
    private void runRecognizerSetup(final Intent intent) {
        new AsyncTask<Void, Void, Exception>() {
            @Override
            protected Exception doInBackground(Void... params) {
                try {
                    Assets assets = new Assets(DeepSpeechRecognitionService.this);
                    File assetDir = assets.syncAssets();
                    model = new DeepSpeechModel(assetDir.toString() + "/deepspeech-catala/model.tflite");
                    model.enableExternalScorer(assetDir.toString() + "/deepspeech-catala/kenlm.scorer");
                    setupRecognizer();
                } catch (IOException e) {
                    return e;
                }
                return null;
            }

            @Override
            protected void onPostExecute(Exception result) {
                if (result != null) {
                    Log.e(TAG, "Failed to init recognizer " + result);
                    error(android.speech.SpeechRecognizer.ERROR_CLIENT);
                } else {
                    readyForSpeech(new Bundle());
                    beginningOfSpeech();
                }
            }
        }.execute();
    }

    @Override
    public void onDestroy() {
        super.onDestroy();
        if (speechService != null) {
            speechService.cancel();
            speechService.shutdown();
        }
    }

    /** Creates the capture loop around the loaded model and starts it. */
    private void setupRecognizer() throws IOException {
        try {
            Log.i(TAG, "Setting up recognizer");
            // Bug fix: assign the FIELD (the original declared a shadowing
            // local, leaving the field null so results()/error()/onTimeout()
            // and onDestroy() could never reach the running service).
            speechService = new DeepSpeechService(this.model, 16000.0f);
            speechService.addListener(this);
            speechService.startListening();
        } catch (IOException e) {
            // Log with the throwable: keeps the stack trace and avoids the
            // NPE Log.e(TAG, e.getMessage()) throws when the message is null.
            Log.e(TAG, "Failed to set up recognizer", e);
        }
    }

    private void readyForSpeech(Bundle bundle) {
        try {
            mCallback.readyForSpeech(bundle);
        } catch (RemoteException e) {
            // empty
        }
    }

    /**
     * Delivers a result bundle to the client.
     *
     * @param isFinal true for a final result (stops the capture loop, matching
     *                VoskRecognitionService), false for a partial hypothesis
     *                (capture keeps running so further audio is decoded).
     */
    private void results(Bundle bundle, boolean isFinal) {
        try {
            if (isFinal) {
                // Only stop listening for FINAL results; cancelling on every
                // partial hypothesis would kill the stream mid-utterance.
                if (speechService != null) {
                    speechService.cancel();
                }
                mCallback.results(bundle);
            } else {
                mCallback.partialResults(bundle);
            }
        } catch (RemoteException e) {
            // empty
        }
    }

    /** Wraps a single hypothesis string in the bundle format clients expect. */
    private Bundle createResultsBundle(String hypothesis) {
        ArrayList<String> hypotheses = new ArrayList<>();
        hypotheses.add(hypothesis);
        Bundle bundle = new Bundle();
        bundle.putStringArrayList(android.speech.SpeechRecognizer.RESULTS_RECOGNITION, hypotheses);
        return bundle;
    }

    private void beginningOfSpeech() {
        try {
            mCallback.beginningOfSpeech();
        } catch (RemoteException e) {
            // empty
        }
    }

    private void error(int errorCode) {
        // Guard: setup may have failed before speechService was created.
        if (speechService != null) {
            speechService.cancel();
        }
        try {
            mCallback.error(errorCode);
        } catch (RemoteException e) {
            // empty
        }
    }

    @Override
    public void onResult(String hypothesis) {
        if (hypothesis != null) {
            Log.i(TAG, hypothesis);
            results(createResultsBundle(hypothesis), true);
        }
    }

    @Override
    public void onPartialResult(String hypothesis) {
        if (hypothesis != null) {
            Log.i(TAG, hypothesis);
            results(createResultsBundle(hypothesis), false);
        }
    }

    @Override
    public void onError(Exception e) {
        // Log the throwable itself: preserves the stack trace and is safe
        // when e.getMessage() is null.
        Log.e(TAG, "Recognition error", e);
        error(android.speech.SpeechRecognizer.ERROR_CLIENT);
    }

    @Override
    public void onTimeout() {
        // Guard against the setup-failure path where no service exists.
        if (speechService != null) {
            speechService.cancel();
            speechService.startListening();
        }
    }
}

View File

@ -0,0 +1,313 @@
// Copyright 2019 Alpha Cephei Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cat.oreilly.localstt;
import static java.lang.String.format;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import android.media.AudioFormat;
import android.media.AudioRecord;
import android.media.MediaRecorder.AudioSource;
import android.os.Handler;
import android.os.Looper;
import android.util.Log;
import org.kaldi.RecognitionListener;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechStreamingState;
import com.konovalov.vad.Vad;
import com.konovalov.vad.VadConfig;
/**
* Service that records audio in a thread, passes it to a recognizer and emits
* recognition results. Recognition events are passed to a client using
* {@link RecognitionListener}
*
*/
/**
 * Service that records audio in a thread, passes it to a recognizer and emits
 * recognition results. Recognition events are passed to a client using
 * {@link RecognitionListener}
 *
 */
public class DeepSpeechService {

    protected static final String TAG = DeepSpeechService.class.getSimpleName();

    // Native DeepSpeech model and its single streaming decode context.
    // NOTE(review): streamContext is created exactly once here and finished by
    // both stop() and the recognizer thread's run(); restarting listening
    // after finishStream() reuses a finished stream, and stop() after a
    // completed run() finishes it twice -- confirm against the DeepSpeech API.
    private final DeepSpeechModel model;
    private final DeepSpeechStreamingState streamContext;

    // Voice activity detector used to spot the silence that ends an utterance.
    private final Vad vad;

    private final int sampleRate;
    private final static float BUFFER_SIZE_SECONDS = 0.4f;
    // Samples per read: sampleRate * BUFFER_SIZE_SECONDS (set in constructor).
    private int bufferSize;
    private final AudioRecord recorder;

    // Capture/decode thread; non-null only while listening.
    private Thread recognizerThread;

    // Listener events are always delivered on the main (UI) thread.
    private final Handler mainHandler = new Handler(Looper.getMainLooper());

    private final Collection<RecognitionListener> listeners = new HashSet<RecognitionListener>();

    /**
     * Creates speech service. Service holds the AudioRecord object, so you need to
     * call {@link release} in order to properly finalize it.
     *
     * @param model      loaded DeepSpeech model (ownership shared with caller).
     * @param sampleRate capture rate in Hz; truncated to an int.
     * @throws IOException thrown if audio recorder can not be created for some
     *                     reason.
     */
    public DeepSpeechService(DeepSpeechModel model, float sampleRate) throws IOException {
        this.model = model;
        this.sampleRate = (int) sampleRate;
        this.streamContext = model.createStream();
        // VAD is hard-wired to 16 kHz / 480-sample frames regardless of the
        // sampleRate argument -- only correct for 16 kHz capture.
        vad = new Vad(VadConfig.newBuilder().setSampleRate(VadConfig.SampleRate.SAMPLE_RATE_16K)
                .setFrameSize(VadConfig.FrameSize.FRAME_SIZE_480).setMode(VadConfig.Mode.NORMAL).build());
        // 0.4 s of samples per read; AudioRecord's buffer is sized in BYTES,
        // hence *2 for 16-bit PCM.
        bufferSize = Math.round(this.sampleRate * BUFFER_SIZE_SECONDS);
        recorder = new AudioRecord(AudioSource.VOICE_RECOGNITION, this.sampleRate, AudioFormat.CHANNEL_IN_MONO,
                AudioFormat.ENCODING_PCM_16BIT, bufferSize * 2);

        if (recorder.getState() == AudioRecord.STATE_UNINITIALIZED) {
            recorder.release();
            throw new IOException("Failed to initialize recorder. Microphone might be already in use.");
        }
        Log.i(TAG, "DeepSpeechService initialized");
    }

    /**
     * Adds listener.
     */
    public void addListener(RecognitionListener listener) {
        synchronized (listeners) {
            listeners.add(listener);
        }
    }

    /**
     * Removes listener.
     */
    public void removeListener(RecognitionListener listener) {
        synchronized (listeners) {
            listeners.remove(listener);
        }
    }

    /**
     * Starts recognition. Does nothing if recognition is active.
     *
     * @return true if recognition was actually started
     */
    public boolean startListening() {
        if (null != recognizerThread)
            return false;
        recognizerThread = new RecognizerThread();
        recognizerThread.start();
        return true;
    }

    /**
     * Starts recognition. After specified timeout listening stops and the
     * endOfSpeech signals about that. Does nothing if recognition is active.
     *
     * @param timeout timeout in milliseconds to listen.
     *
     * @return true if recognition was actually started
     */
    public boolean startListening(int timeout) {
        if (null != recognizerThread)
            return false;
        recognizerThread = new RecognizerThread(timeout);
        recognizerThread.start();
        return true;
    }

    // Interrupts and joins the capture thread; returns false if not running.
    private boolean stopRecognizerThread() {
        if (null == recognizerThread)
            return false;

        try {
            recognizerThread.interrupt();
            recognizerThread.join();
        } catch (InterruptedException e) {
            // Restore the interrupted status.
            Thread.currentThread().interrupt();
        }

        recognizerThread = null;
        return true;
    }

    /**
     * Stops recognition. All listeners should receive final result if there is any.
     * Does nothing if recognition is not active.
     *
     * @return true if recognition was actually stopped
     */
    public boolean stop() {
        boolean result = stopRecognizerThread();
        if (result) {
            // NOTE(review): the thread's run() may already have finished this
            // same stream before being interrupted -- verify double-finish is
            // safe in libdeepspeech.
            mainHandler.post(new ResultEvent(model.finishStream(streamContext), true));
        }
        return result;
    }

    /**
     * Cancels recognition. Listeners do not receive final result. Does nothing if
     * recognition is not active.
     *
     * @return true if recognition was actually canceled
     */
    public boolean cancel() {
        Log.d(TAG, "#cancel");
        boolean result = stopRecognizerThread();
        // NOTE(review): freeModel() releases the NATIVE model, it does not
        // merely reset state; any later use of this.model (e.g. a restarted
        // RecognizerThread, or onTimeout's re-listen) would touch freed
        // memory -- confirm intended lifecycle.
        this.model.freeModel(); // Reset recognizer state
        return result;
    }

    /**
     * Shutdown the recognizer and release the recorder
     */
    public void shutdown() {
        Log.d(TAG, "#shutdown");
        this.model.freeModel();
        recorder.release();
    }

    /**
     * Capture loop: reads PCM from the AudioRecord, feeds it to the DeepSpeech
     * stream, and uses the VAD to end the utterance at the first silence after
     * speech was detected. Posts the final result to the main thread.
     */
    private final class RecognizerThread extends Thread {

        // Samples left before the optional timeout fires; NO_TIMEOUT disables.
        private int remainingSamples;
        private int timeoutSamples;
        private final static int NO_TIMEOUT = -1;

        public RecognizerThread(int timeout) {
            if (timeout != NO_TIMEOUT)
                this.timeoutSamples = timeout * sampleRate / 1000;
            else
                this.timeoutSamples = NO_TIMEOUT;
            this.remainingSamples = this.timeoutSamples;
        }

        public RecognizerThread() {
            this(NO_TIMEOUT);
        }

        @Override
        public void run() {
            Log.i(TAG, "Start Recording...");
            vad.start();
            recorder.startRecording();
            if (recorder.getRecordingState() == AudioRecord.RECORDSTATE_STOPPED) {
                recorder.stop();
                IOException ioe = new IOException("Failed to start recording. Microphone might be already in use.");
                mainHandler.post(new OnErrorEvent(ioe));
                return;
            }

            short[] buffer = new short[bufferSize];
            int nread = recorder.read(buffer, 0, buffer.length);
            boolean speechDetected = false;
            boolean feedAudio = true;

            while (!interrupted() && ((timeoutSamples == NO_TIMEOUT) || (remainingSamples > 0)) && feedAudio) {
                if (nread < 0) {
                    throw new RuntimeException("error reading audio buffer");
                } else {
                    Log.i(TAG, "Feeding audio");
                    model.feedAudioContent(streamContext, buffer, nread);
                    // NOTE(review): buffer holds ~0.4 s of samples but the VAD
                    // was configured for 480-sample frames -- confirm the
                    // library accepts arbitrary buffer lengths.
                    boolean isSpeech = vad.isSpeech(buffer);
                    if (isSpeech) {
                        Log.d(TAG, "Speech detected");
                        speechDetected = true;
                    }
                    // End the utterance on the first silent buffer AFTER
                    // speech has been heard at least once.
                    if (speechDetected && !isSpeech) {
                        Log.d(TAG, "Silence detected");
                        feedAudio = false;
                    }
                }

                if (timeoutSamples != NO_TIMEOUT) {
                    remainingSamples = remainingSamples - nread;
                }
                nread = recorder.read(buffer, 0, buffer.length);
            }

            mainHandler.post(new ResultEvent(model.finishStream(streamContext), true));
            recorder.stop();
            vad.stop();

            // Remove all pending notifications.
            // NOTE(review): this can also remove the ResultEvent posted two
            // lines above if it has not run yet -- verify ordering.
            mainHandler.removeCallbacksAndMessages(null);

            // If we met timeout signal that speech ended
            if (timeoutSamples != NO_TIMEOUT && remainingSamples <= 0) {
                mainHandler.post(new TimeoutEvent());
            }
        }
    }

    // Base for main-thread listener notifications; snapshots the listener set
    // so callbacks can add/remove listeners without a
    // ConcurrentModificationException.
    private abstract class RecognitionEvent implements Runnable {
        public void run() {
            RecognitionListener[] emptyArray = new RecognitionListener[0];
            for (RecognitionListener listener : listeners.toArray(emptyArray))
                execute(listener);
        }

        protected abstract void execute(RecognitionListener listener);
    }

    // Delivers a partial or final hypothesis string.
    private class ResultEvent extends RecognitionEvent {
        protected final String hypothesis;
        private final boolean finalResult;

        ResultEvent(String hypothesis, boolean finalResult) {
            this.hypothesis = hypothesis;
            this.finalResult = finalResult;
        }

        @Override
        protected void execute(RecognitionListener listener) {
            if (finalResult)
                listener.onResult(hypothesis);
            else
                listener.onPartialResult(hypothesis);
        }
    }

    // Delivers a recognition failure.
    private class OnErrorEvent extends RecognitionEvent {
        private final Exception exception;

        OnErrorEvent(Exception exception) {
            this.exception = exception;
        }

        @Override
        protected void execute(RecognitionListener listener) {
            listener.onError(exception);
        }
    }

    // Signals that the listening timeout elapsed.
    private class TimeoutEvent extends RecognitionEvent {
        @Override
        protected void execute(RecognitionListener listener) {
            listener.onTimeout();
        }
    }
}

View File

@ -21,7 +21,7 @@ import java.util.ArrayList;
import java.io.IOException; import java.io.IOException;
public class VoskRecognitionService extends RecognitionService implements RecognitionListener { public class VoskRecognitionService extends RecognitionService implements RecognitionListener {
private final static String TAG = VoskRecognitionService.class.getName(); private final static String TAG = VoskRecognitionService.class.getSimpleName();
private KaldiRecognizer recognizer; private KaldiRecognizer recognizer;
private SpeechService speechService; private SpeechService speechService;
private Model model; private Model model;
@ -38,13 +38,13 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
@Override @Override
protected void onCancel(Callback callback) { protected void onCancel(Callback callback) {
Log.i(TAG, "onCancel"); Log.i(TAG, "onCancel");
results(new Bundle()); results(new Bundle(), true);
} }
@Override @Override
protected void onStopListening(Callback callback) { protected void onStopListening(Callback callback) {
Log.i(TAG, "onStopListening"); Log.i(TAG, "onStopListening");
results(new Bundle()); results(new Bundle(), true);
} }
private void runRecognizerSetup(final Intent intent) { private void runRecognizerSetup(final Intent intent) {
@ -108,10 +108,14 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
} }
} }
private void results(Bundle bundle) { private void results(Bundle bundle, boolean isFinal) {
speechService.cancel();
try { try {
mCallback.results(bundle); if (isFinal) {
speechService.cancel();
mCallback.results(bundle);
} else {
mCallback.partialResults(bundle);
}
} catch (RemoteException e) { } catch (RemoteException e) {
// empty // empty
} }
@ -149,7 +153,7 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
Gson gson = new Gson(); Gson gson = new Gson();
Map<String, String> map = gson.fromJson(hypothesis, Map.class); Map<String, String> map = gson.fromJson(hypothesis, Map.class);
String text = map.get("text"); String text = map.get("text");
results(createResultsBundle(text)); results(createResultsBundle(text), true);
} }
} }
@ -160,7 +164,7 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
Gson gson = new Gson(); Gson gson = new Gson();
Map<String, String> map = gson.fromJson(hypothesis, Map.class); Map<String, String> map = gson.fromJson(hypothesis, Map.class);
String text = map.get("partial"); String text = map.get("partial");
results(createResultsBundle(text)); results(createResultsBundle(text), false);
} }
} }