diff --git a/.gitignore b/.gitignore
index 8a01b30..b3c33ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,6 @@ build
.project
# JDT-specific (Eclipse Java Development Tools)
-.classpath
\ No newline at end of file
+.classpath
+
+.vscode
\ No newline at end of file
diff --git a/app/assets.xml b/app/assets.xml
new file mode 100644
index 0000000..e4cb418
--- /dev/null
+++ b/app/assets.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Ant targets imported from build.gradle via ant.importBuild: "list" writes
+     assets.lst, "checksum" generates the .md5 files and "clean_assets" removes both. -->
+
+<project name="assets" basedir=".">
+
+    <fileset id="assets.fileset" dir="src/main/assets/sync" excludes="assets.lst,**/*.md5" />
+
+    <property name="assets.list.file" value="src/main/assets/sync/assets.lst" />
+
+    <target name="list">
+        <pathconvert property="assets.list" refid="assets.fileset" pathsep="${line.separator}" dirsep="/">
+            <map from="${basedir}/src/main/assets/sync/" to="" />
+        </pathconvert>
+        <echo file="${assets.list.file}">${assets.list}</echo>
+    </target>
+
+    <target name="checksum">
+        <checksum fileext=".md5">
+            <fileset refid="assets.fileset" />
+        </checksum>
+    </target>
+
+    <target name="clean_assets">
+        <delete file="${assets.list.file}" />
+        <delete>
+            <fileset dir="src/main/assets/sync" includes="**/*.md5" />
+        </delete>
+    </target>
+
+</project>
diff --git a/app/build.gradle b/app/build.gradle
index c47967a..f2222f8 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -5,6 +5,9 @@ repositories {
maven {
url "https://dl.bintray.com/alphacep/vosk"
}
+ maven {
+ url "https://jitpack.io"
+ }
}
android {
@@ -31,4 +34,12 @@ dependencies {
implementation 'com.alphacep:vosk-android:0.3.15'
implementation 'androidx.appcompat:appcompat:1.2.0'
implementation 'com.google.code.gson:gson:2.8.6'
-}
\ No newline at end of file
+ implementation 'org.mozilla.deepspeech:libdeepspeech:0.8.2'
+ implementation 'com.github.gkonovalov:android-vad:1.0.0'
+}
+
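+// assets.xml defines the Ant targets "list", "checksum" and "clean_assets" that
+// generate assets.lst and the .md5 files org.kaldi.Assets uses to sync models.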
+ant.importBuild 'assets.xml'
+preBuild.dependsOn(list, checksum)
+clean.dependsOn(clean_assets)
\ No newline at end of file
diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml
index efdd316..b157c2b 100644
--- a/app/src/main/AndroidManifest.xml
+++ b/app/src/main/AndroidManifest.xml
@@ -27,5 +27,23 @@
            android:name="android.speech"
            android:resource="@xml/recognition_service" />
        </service>
+
+        <!-- RecognitionService backed by Mozilla DeepSpeech -->
+        <service
+            android:name=".DeepSpeechRecognitionService"
+            android:label="@string/app_name"
+            android:exported="true">
+
+            <intent-filter>
+                <action android:name="android.speech.RecognitionService" />
+                <category android:name="android.intent.category.DEFAULT" />
+            </intent-filter>
+
+            <meta-data
+                android:name="android.speech"
+                android:resource="@xml/recognition_service" />
+
+        </service>
+
    </application>
</manifest>
\ No newline at end of file
diff --git a/app/src/main/assets/sync/assets.lst b/app/src/main/assets/sync/assets.lst
new file mode 100644
index 0000000..2e7e59d
--- /dev/null
+++ b/app/src/main/assets/sync/assets.lst
@@ -0,0 +1,26 @@
+deepspeech-catala/kenlm.scorer
+deepspeech-catala/model.tflite
+vosk-catala/README
+vosk-catala/am/final.mdl
+vosk-catala/am/tree
+vosk-catala/conf/mfcc.conf
+vosk-catala/conf/model.conf
+vosk-catala/graph/Gr.fst
+vosk-catala/graph/HCLr.fst
+vosk-catala/graph/disambig_tid.int
+vosk-catala/graph/phones/align_lexicon.int
+vosk-catala/graph/phones/align_lexicon.txt
+vosk-catala/graph/phones/disambig.int
+vosk-catala/graph/phones/disambig.txt
+vosk-catala/graph/phones/optional_silence.csl
+vosk-catala/graph/phones/optional_silence.int
+vosk-catala/graph/phones/optional_silence.txt
+vosk-catala/graph/phones/silence.csl
+vosk-catala/graph/phones/word_boundary.int
+vosk-catala/graph/phones/word_boundary.txt
+vosk-catala/ivector/final.dubm
+vosk-catala/ivector/final.ie
+vosk-catala/ivector/final.mat
+vosk-catala/ivector/global_cmvn.stats
+vosk-catala/ivector/online_cmvn.conf
+vosk-catala/ivector/splice.conf
\ No newline at end of file
diff --git a/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer
new file mode 100644
index 0000000..a0f5989
Binary files /dev/null and b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer differ
diff --git a/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer.md5 b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer.md5
new file mode 100644
index 0000000..c0d9066
--- /dev/null
+++ b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer.md5
@@ -0,0 +1 @@
+d562825f02f2ba36cbd0a75a17e84e8d
diff --git a/app/src/main/assets/sync/deepspeech-catala/model.tflite b/app/src/main/assets/sync/deepspeech-catala/model.tflite
new file mode 100644
index 0000000..9c0cfcb
Binary files /dev/null and b/app/src/main/assets/sync/deepspeech-catala/model.tflite differ
diff --git a/app/src/main/assets/sync/deepspeech-catala/model.tflite.md5 b/app/src/main/assets/sync/deepspeech-catala/model.tflite.md5
new file mode 100644
index 0000000..09c621e
--- /dev/null
+++ b/app/src/main/assets/sync/deepspeech-catala/model.tflite.md5
@@ -0,0 +1 @@
+735b1327dc3c00af256af64be33cbed3
diff --git a/app/src/main/assets/sync/vosk-catala/README.md5 b/app/src/main/assets/sync/vosk-catala/README.md5
new file mode 100644
index 0000000..424ec31
--- /dev/null
+++ b/app/src/main/assets/sync/vosk-catala/README.md5
@@ -0,0 +1 @@
+f49442fa8c9e15bfbb6379c788b3104f
diff --git a/app/src/main/java/cat/oreilly/localstt/DeepSpeechRecognitionService.java b/app/src/main/java/cat/oreilly/localstt/DeepSpeechRecognitionService.java
new file mode 100644
index 0000000..0f248df
--- /dev/null
+++ b/app/src/main/java/cat/oreilly/localstt/DeepSpeechRecognitionService.java
@@ -0,0 +1,182 @@
+package cat.oreilly.localstt;
+
+import android.content.Intent;
+import android.os.AsyncTask;
+import android.os.Bundle;
+import android.os.RemoteException;
+import android.speech.RecognitionService;
+import android.util.Log;
+
+import org.kaldi.Assets;
+import org.kaldi.RecognitionListener;
+import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.io.IOException;
+
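+/**
+ * RecognitionService backed by Mozilla DeepSpeech (libdeepspeech). Clients reach it
+ * through the standard android.speech.SpeechRecognizer API via the manifest entry.
+ */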
+public class DeepSpeechRecognitionService extends RecognitionService implements RecognitionListener {
+ private final static String TAG = DeepSpeechRecognitionService.class.getSimpleName();
+ private DeepSpeechModel model;
+ private DeepSpeechService speechService;
+
+ private RecognitionService.Callback mCallback;
+
+ @Override
+ protected void onStartListening(Intent intent, Callback callback) {
+ mCallback = callback;
+ Log.i(TAG, "onStartListening");
+ runRecognizerSetup(intent);
+ }
+
+ @Override
+ protected void onCancel(Callback callback) {
+ Log.i(TAG, "onCancel");
+ results(new Bundle(), true);
+ }
+
+ @Override
+ protected void onStopListening(Callback callback) {
+ Log.i(TAG, "onStopListening");
+ results(new Bundle(), true);
+ }
+
+ private void runRecognizerSetup(final Intent intent) {
+        new AsyncTask<Void, Void, Exception>() {
+ @Override
+ protected Exception doInBackground(Void... params) {
+ try {
+ Assets assets = new Assets(DeepSpeechRecognitionService.this);
+ File assetDir = assets.syncAssets();
+
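+                    // The model files were just synced from APK assets to storage;
+                    // libdeepspeech needs real filesystem paths, not asset URIs.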
+ model = new DeepSpeechModel(assetDir.toString() + "/deepspeech-catala/model.tflite");
+ model.enableExternalScorer(assetDir.toString() + "/deepspeech-catala/kenlm.scorer");
+
+ setupRecognizer();
+ } catch (IOException e) {
+ return e;
+ }
+ return null;
+ }
+
+ @Override
+ protected void onPostExecute(Exception result) {
+ if (result != null) {
+ Log.e(TAG, "Failed to init recognizer " + result);
+ error(android.speech.SpeechRecognizer.ERROR_CLIENT);
+ } else {
+ readyForSpeech(new Bundle());
+ beginningOfSpeech();
+                }
+ }
+ }.execute();
+ }
+
+ @Override
+ public void onDestroy() {
+ super.onDestroy();
+
+ if (speechService != null) {
+ speechService.cancel();
+ speechService.shutdown();
+ }
+ }
+
+ private void setupRecognizer() throws IOException {
+ try {
+ Log.i(TAG, "Setting up recognizer");
+            speechService = new DeepSpeechService(this.model, 16000.0f);
+ speechService.addListener(this);
+ speechService.startListening();
+ } catch (IOException e) {
+ Log.e(TAG, e.getMessage());
+ }
+ }
+
+ private void readyForSpeech(Bundle bundle) {
+ try {
+ mCallback.readyForSpeech(bundle);
+ } catch (RemoteException e) {
+ // empty
+ }
+ }
+
+    private void results(Bundle bundle, boolean isFinal) {
+        try {
+            if (isFinal) {
+                // Tear the stream down only on final results, mirroring
+                // VoskRecognitionService, so partial results keep it alive.
+                if (speechService != null) {
+                    speechService.cancel();
+                }
+                mCallback.results(bundle);
+            } else {
+                mCallback.partialResults(bundle);
+            }
+        } catch (RemoteException e) {
+            // empty
+        }
+    }
+
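+    // Wraps a single hypothesis in the Bundle format SpeechRecognizer clients expect.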
+ private Bundle createResultsBundle(String hypothesis) {
+        ArrayList<String> hypotheses = new ArrayList<>();
+ hypotheses.add(hypothesis);
+ Bundle bundle = new Bundle();
+ bundle.putStringArrayList(android.speech.SpeechRecognizer.RESULTS_RECOGNITION, hypotheses);
+ return bundle;
+ }
+
+ private void beginningOfSpeech() {
+ try {
+ mCallback.beginningOfSpeech();
+ } catch (RemoteException e) {
+ // empty
+ }
+ }
+
+    private void error(int errorCode) {
+        if (speechService != null) {
+            speechService.cancel();
+        }
+        try {
+            mCallback.error(errorCode);
+        } catch (RemoteException e) {
+            // empty
+        }
+    }
+
+ @Override
+ public void onResult(String hypothesis) {
+ if (hypothesis != null) {
+ Log.i(TAG, hypothesis);
+ results(createResultsBundle(hypothesis), true);
+ }
+ }
+
+ @Override
+ public void onPartialResult(String hypothesis) {
+ if (hypothesis != null) {
+ Log.i(TAG, hypothesis);
+ results(createResultsBundle(hypothesis), false);
+ }
+ }
+
+ @Override
+ public void onError(Exception e) {
+ Log.e(TAG, e.getMessage());
+ error(android.speech.SpeechRecognizer.ERROR_CLIENT);
+ }
+
+ @Override
+ public void onTimeout() {
+ speechService.cancel();
+ speechService.startListening();
+ }
+}
diff --git a/app/src/main/java/cat/oreilly/localstt/DeepSpeechService.java b/app/src/main/java/cat/oreilly/localstt/DeepSpeechService.java
new file mode 100644
index 0000000..7811d84
--- /dev/null
+++ b/app/src/main/java/cat/oreilly/localstt/DeepSpeechService.java
@@ -0,0 +1,324 @@
+// Copyright 2019 Alpha Cephei Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cat.oreilly.localstt;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder.AudioSource;
+import android.os.Handler;
+import android.os.Looper;
+import android.util.Log;
+
+import org.kaldi.RecognitionListener;
+import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
+import org.mozilla.deepspeech.libdeepspeech.DeepSpeechStreamingState;
+
+import com.konovalov.vad.Vad;
+import com.konovalov.vad.VadConfig;
+
+/**
+ * Service that records audio in a thread, passes it to a recognizer and emits
+ * recognition results. Recognition events are passed to a client using
+ * {@link RecognitionListener}
+ *
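+ * <p>Minimal usage sketch; the model path and listener are placeholders:</p>
+ * <pre>{@code
+ *   DeepSpeechModel model = new DeepSpeechModel("path/to/model.tflite");
+ *   DeepSpeechService service = new DeepSpeechService(model, 16000.0f);
+ *   service.addListener(listener); // any org.kaldi.RecognitionListener
+ *   service.startListening();
+ *   // ...
+ *   service.stop();     // posts the final result to listeners
+ *   service.shutdown(); // frees the model and releases the recorder
+ * }</pre>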
+ */
+public class DeepSpeechService {
+
+ protected static final String TAG = DeepSpeechService.class.getSimpleName();
+
+ private final DeepSpeechModel model;
+ private final DeepSpeechStreamingState streamContext;
+ private final Vad vad;
+
+ private final int sampleRate;
+ private final static float BUFFER_SIZE_SECONDS = 0.4f;
+ private int bufferSize;
+ private final AudioRecord recorder;
+
+ private Thread recognizerThread;
+
+ private final Handler mainHandler = new Handler(Looper.getMainLooper());
+
+    private final Collection<RecognitionListener> listeners = new HashSet<RecognitionListener>();
+
+ /**
+ * Creates speech service. Service holds the AudioRecord object, so you need to
+     * call {@link #shutdown()} in order to properly finalize it.
+ *
+ * @throws IOException thrown if audio recorder can not be created for some
+ * reason.
+ */
+ public DeepSpeechService(DeepSpeechModel model, float sampleRate) throws IOException {
+ this.model = model;
+ this.sampleRate = (int) sampleRate;
+ this.streamContext = model.createStream();
+
+ vad = new Vad(VadConfig.newBuilder().setSampleRate(VadConfig.SampleRate.SAMPLE_RATE_16K)
+ .setFrameSize(VadConfig.FrameSize.FRAME_SIZE_480).setMode(VadConfig.Mode.NORMAL).build());
+
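+        // AudioRecord takes its buffer size in bytes; 16-bit PCM samples are two
+        // bytes each, hence bufferSize * 2 for ~0.4 s of audio.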
+ bufferSize = Math.round(this.sampleRate * BUFFER_SIZE_SECONDS);
+ recorder = new AudioRecord(AudioSource.VOICE_RECOGNITION, this.sampleRate, AudioFormat.CHANNEL_IN_MONO,
+ AudioFormat.ENCODING_PCM_16BIT, bufferSize * 2);
+
+ if (recorder.getState() == AudioRecord.STATE_UNINITIALIZED) {
+ recorder.release();
+ throw new IOException("Failed to initialize recorder. Microphone might be already in use.");
+ }
+ Log.i(TAG, "DeepSpeechService initialized");
+ }
+
+ /**
+ * Adds listener.
+ */
+ public void addListener(RecognitionListener listener) {
+ synchronized (listeners) {
+ listeners.add(listener);
+ }
+ }
+
+ /**
+ * Removes listener.
+ */
+ public void removeListener(RecognitionListener listener) {
+ synchronized (listeners) {
+ listeners.remove(listener);
+ }
+ }
+
+ /**
+ * Starts recognition. Does nothing if recognition is active.
+ *
+ * @return true if recognition was actually started
+ */
+ public boolean startListening() {
+ if (null != recognizerThread)
+ return false;
+
+ recognizerThread = new RecognizerThread();
+ recognizerThread.start();
+ return true;
+ }
+
+ /**
+ * Starts recognition. After specified timeout listening stops and the
+ * endOfSpeech signals about that. Does nothing if recognition is active.
+ *
+     * @param timeout timeout in milliseconds to listen.
+ *
+ * @return true if recognition was actually started
+ */
+ public boolean startListening(int timeout) {
+ if (null != recognizerThread)
+ return false;
+
+ recognizerThread = new RecognizerThread(timeout);
+ recognizerThread.start();
+ return true;
+ }
+
+ private boolean stopRecognizerThread() {
+ if (null == recognizerThread)
+ return false;
+
+ try {
+ recognizerThread.interrupt();
+ recognizerThread.join();
+ } catch (InterruptedException e) {
+ // Restore the interrupted status.
+ Thread.currentThread().interrupt();
+ }
+
+ recognizerThread = null;
+ return true;
+ }
+
+ /**
+ * Stops recognition. All listeners should receive final result if there is any.
+ * Does nothing if recognition is not active.
+ *
+ * @return true if recognition was actually stopped
+ */
+ public boolean stop() {
+ boolean result = stopRecognizerThread();
+ if (result) {
+ mainHandler.post(new ResultEvent(model.finishStream(streamContext), true));
+ }
+ return result;
+ }
+
+ /**
+ * Cancels recognition. Listeners do not receive final result. Does nothing if
+ * recognition is not active.
+ *
+ * @return true if recognition was actually canceled
+ */
+ public boolean cancel() {
+ Log.d(TAG, "#cancel");
+ boolean result = stopRecognizerThread();
+        this.model.freeModel(); // Frees the native model; a new one is needed to listen again.
+ return result;
+ }
+
+ /**
+ * Shutdown the recognizer and release the recorder
+ */
+ public void shutdown() {
+ Log.d(TAG, "#shutdown");
+ this.model.freeModel();
+ recorder.release();
+ }
+
+ private final class RecognizerThread extends Thread {
+
+ private int remainingSamples;
+ private int timeoutSamples;
+ private final static int NO_TIMEOUT = -1;
+
+ public RecognizerThread(int timeout) {
+ if (timeout != NO_TIMEOUT)
+ this.timeoutSamples = timeout * sampleRate / 1000;
+ else
+ this.timeoutSamples = NO_TIMEOUT;
+ this.remainingSamples = this.timeoutSamples;
+ }
+
+ public RecognizerThread() {
+ this(NO_TIMEOUT);
+ }
+
+ @Override
+ public void run() {
+ Log.i(TAG, "Start Recording...");
+
+ vad.start();
+ recorder.startRecording();
+ if (recorder.getRecordingState() == AudioRecord.RECORDSTATE_STOPPED) {
+ recorder.stop();
+ IOException ioe = new IOException("Failed to start recording. Microphone might be already in use.");
+ mainHandler.post(new OnErrorEvent(ioe));
+ return;
+ }
+
+ short[] buffer = new short[bufferSize];
+ int nread = recorder.read(buffer, 0, buffer.length);
+ boolean speechDetected = false;
+ boolean feedAudio = true;
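+            // Endpointing: once the VAD has reported speech, the first non-speech
+            // buffer marks the end of the utterance and stops feeding the decoder.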
+
+ while (!interrupted() && ((timeoutSamples == NO_TIMEOUT) || (remainingSamples > 0)) && feedAudio) {
+
+ if (nread < 0) {
+ throw new RuntimeException("error reading audio buffer");
+ } else {
+ Log.i(TAG, "Feeding audio");
+ model.feedAudioContent(streamContext, buffer, nread);
+ boolean isSpeech = vad.isSpeech(buffer);
+ if (isSpeech) {
+ Log.d(TAG, "Speech detected");
+ speechDetected = true;
+ }
+ if (speechDetected && !isSpeech) {
+ Log.d(TAG, "Silence detected");
+ feedAudio = false;
+ }
+
+ }
+
+ if (timeoutSamples != NO_TIMEOUT) {
+ remainingSamples = remainingSamples - nread;
+ }
+ nread = recorder.read(buffer, 0, buffer.length);
+ }
+
+ mainHandler.post(new ResultEvent(model.finishStream(streamContext), true));
+
+ recorder.stop();
+ vad.stop();
+
+ // Remove all pending notifications.
+ mainHandler.removeCallbacksAndMessages(null);
+
+ // If we met timeout signal that speech ended
+ if (timeoutSamples != NO_TIMEOUT && remainingSamples <= 0) {
+ mainHandler.post(new TimeoutEvent());
+ }
+ }
+ }
+
+ private abstract class RecognitionEvent implements Runnable {
+ public void run() {
+ RecognitionListener[] emptyArray = new RecognitionListener[0];
+ for (RecognitionListener listener : listeners.toArray(emptyArray))
+ execute(listener);
+ }
+
+ protected abstract void execute(RecognitionListener listener);
+ }
+
+ private class ResultEvent extends RecognitionEvent {
+ protected final String hypothesis;
+ private final boolean finalResult;
+
+ ResultEvent(String hypothesis, boolean finalResult) {
+ this.hypothesis = hypothesis;
+ this.finalResult = finalResult;
+ }
+
+ @Override
+ protected void execute(RecognitionListener listener) {
+ if (finalResult)
+ listener.onResult(hypothesis);
+ else
+ listener.onPartialResult(hypothesis);
+ }
+ }
+
+ private class OnErrorEvent extends RecognitionEvent {
+ private final Exception exception;
+
+ OnErrorEvent(Exception exception) {
+ this.exception = exception;
+ }
+
+ @Override
+ protected void execute(RecognitionListener listener) {
+ listener.onError(exception);
+ }
+ }
+
+ private class TimeoutEvent extends RecognitionEvent {
+ @Override
+ protected void execute(RecognitionListener listener) {
+ listener.onTimeout();
+ }
+ }
+}
diff --git a/app/src/main/java/cat/oreilly/localstt/VoskRecognitionService.java b/app/src/main/java/cat/oreilly/localstt/VoskRecognitionService.java
index 1fac1f4..9860d11 100644
--- a/app/src/main/java/cat/oreilly/localstt/VoskRecognitionService.java
+++ b/app/src/main/java/cat/oreilly/localstt/VoskRecognitionService.java
@@ -21,7 +21,7 @@ import java.util.ArrayList;
import java.io.IOException;
public class VoskRecognitionService extends RecognitionService implements RecognitionListener {
- private final static String TAG = VoskRecognitionService.class.getName();
+ private final static String TAG = VoskRecognitionService.class.getSimpleName();
private KaldiRecognizer recognizer;
private SpeechService speechService;
private Model model;
@@ -38,13 +38,13 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
@Override
protected void onCancel(Callback callback) {
Log.i(TAG, "onCancel");
- results(new Bundle());
+ results(new Bundle(), true);
}
@Override
protected void onStopListening(Callback callback) {
Log.i(TAG, "onStopListening");
- results(new Bundle());
+ results(new Bundle(), true);
}
private void runRecognizerSetup(final Intent intent) {
@@ -108,10 +108,14 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
}
}
- private void results(Bundle bundle) {
- speechService.cancel();
+ private void results(Bundle bundle, boolean isFinal) {
try {
- mCallback.results(bundle);
+ if (isFinal) {
+ speechService.cancel();
+ mCallback.results(bundle);
+ } else {
+ mCallback.partialResults(bundle);
+ }
} catch (RemoteException e) {
// empty
}
@@ -149,7 +153,7 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
Gson gson = new Gson();
        Map<String, String> map = gson.fromJson(hypothesis, Map.class);
String text = map.get("text");
- results(createResultsBundle(text));
+ results(createResultsBundle(text), true);
}
}
@@ -160,7 +164,7 @@ public class VoskRecognitionService extends RecognitionService implements Recogn
Gson gson = new Gson();
        Map<String, String> map = gson.fromJson(hypothesis, Map.class);
String text = map.get("partial");
- results(createResultsBundle(text));
+ results(createResultsBundle(text), false);
}
}