diff --git a/.gitignore b/.gitignore index 8a01b30..b3c33ec 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ build .project # JDT-specific (Eclipse Java Development Tools) -.classpath \ No newline at end of file +.classpath + +.vscode \ No newline at end of file diff --git a/app/assets.xml b/app/assets.xml new file mode 100644 index 0000000..e4cb418 --- /dev/null +++ b/app/assets.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/app/build.gradle b/app/build.gradle index c47967a..f2222f8 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -5,6 +5,9 @@ repositories { maven { url "https://dl.bintray.com/alphacep/vosk" } + maven { + url "https://jitpack.io" + } } android { @@ -31,4 +34,10 @@ dependencies { implementation 'com.alphacep:vosk-android:0.3.15' implementation 'androidx.appcompat:appcompat:1.2.0' implementation 'com.google.code.gson:gson:2.8.6' -} \ No newline at end of file + implementation 'org.mozilla.deepspeech:libdeepspeech:0.8.2' + implementation 'com.github.gkonovalov:android-vad:1.0.0' +} + +ant.importBuild 'assets.xml' +preBuild.dependsOn(list, checksum) +clean.dependsOn(clean_assets) \ No newline at end of file diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml index efdd316..b157c2b 100644 --- a/app/src/main/AndroidManifest.xml +++ b/app/src/main/AndroidManifest.xml @@ -27,5 +27,23 @@ android:name="android.speech" android:resource="@xml/recognition_service" /> + + + + + + + + + + + + \ No newline at end of file diff --git a/app/src/main/assets/sync/assets.lst b/app/src/main/assets/sync/assets.lst new file mode 100644 index 0000000..2e7e59d --- /dev/null +++ b/app/src/main/assets/sync/assets.lst @@ -0,0 +1,26 @@ +deepspeech-catala/kenlm.scorer +deepspeech-catala/model.tflite +vosk-catala/README +vosk-catala/am/final.mdl +vosk-catala/am/tree +vosk-catala/conf/mfcc.conf +vosk-catala/conf/model.conf +vosk-catala/graph/Gr.fst +vosk-catala/graph/HCLr.fst 
+vosk-catala/graph/disambig_tid.int +vosk-catala/graph/phones/align_lexicon.int +vosk-catala/graph/phones/align_lexicon.txt +vosk-catala/graph/phones/disambig.int +vosk-catala/graph/phones/disambig.txt +vosk-catala/graph/phones/optional_silence.csl +vosk-catala/graph/phones/optional_silence.int +vosk-catala/graph/phones/optional_silence.txt +vosk-catala/graph/phones/silence.csl +vosk-catala/graph/phones/word_boundary.int +vosk-catala/graph/phones/word_boundary.txt +vosk-catala/ivector/final.dubm +vosk-catala/ivector/final.ie +vosk-catala/ivector/final.mat +vosk-catala/ivector/global_cmvn.stats +vosk-catala/ivector/online_cmvn.conf +vosk-catala/ivector/splice.conf \ No newline at end of file diff --git a/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer new file mode 100644 index 0000000..a0f5989 Binary files /dev/null and b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer differ diff --git a/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer.md5 b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer.md5 new file mode 100644 index 0000000..c0d9066 --- /dev/null +++ b/app/src/main/assets/sync/deepspeech-catala/kenlm.scorer.md5 @@ -0,0 +1 @@ +d562825f02f2ba36cbd0a75a17e84e8d diff --git a/app/src/main/assets/sync/deepspeech-catala/model.tflite b/app/src/main/assets/sync/deepspeech-catala/model.tflite new file mode 100644 index 0000000..9c0cfcb Binary files /dev/null and b/app/src/main/assets/sync/deepspeech-catala/model.tflite differ diff --git a/app/src/main/assets/sync/deepspeech-catala/model.tflite.md5 b/app/src/main/assets/sync/deepspeech-catala/model.tflite.md5 new file mode 100644 index 0000000..09c621e --- /dev/null +++ b/app/src/main/assets/sync/deepspeech-catala/model.tflite.md5 @@ -0,0 +1 @@ +735b1327dc3c00af256af64be33cbed3 diff --git a/app/src/main/assets/sync/vosk-catala/README.md5 b/app/src/main/assets/sync/vosk-catala/README.md5 new file mode 100644 index 0000000..424ec31 --- 
package cat.oreilly.localstt;

import android.content.Intent;
import android.os.AsyncTask;
import android.os.Bundle;
import android.os.RemoteException;
import android.speech.RecognitionService;
import android.util.Log;
import android.media.AudioFormat;
import android.media.AudioRecord;
import android.media.MediaRecorder;

import org.kaldi.Assets;
import org.kaldi.RecognitionListener;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;

import java.io.File;
import java.util.Map;
import java.util.ArrayList;
import java.io.IOException;

/**
 * Android {@link RecognitionService} backed by a Mozilla DeepSpeech TFLite model.
 * On the first request the model and external scorer are synced from the app
 * assets to local storage; audio is then streamed through
 * {@link DeepSpeechService} and hypotheses are forwarded to the framework
 * callback as partial/final results.
 */
public class DeepSpeechRecognitionService extends RecognitionService implements RecognitionListener {
    private final static String TAG = DeepSpeechRecognitionService.class.getSimpleName();
    private DeepSpeechModel model;
    private DeepSpeechService speechService;

    // Framework callback for the request currently being served.
    private RecognitionService.Callback mCallback;

    @Override
    protected void onStartListening(Intent intent, Callback callback) {
        mCallback = callback;
        Log.i(TAG, "onStartListening");
        runRecognizerSetup(intent);
    }

    @Override
    protected void onCancel(Callback callback) {
        Log.i(TAG, "onCancel");
        results(new Bundle(), true);
    }

    @Override
    protected void onStopListening(Callback callback) {
        Log.i(TAG, "onStopListening");
        results(new Bundle(), true);
    }

    /**
     * Syncs model assets and initializes the recognizer off the main thread.
     * Reports {@code ERROR_CLIENT} to the caller when initialization fails,
     * otherwise signals readyForSpeech/beginningOfSpeech.
     */
    private void runRecognizerSetup(final Intent intent) {
        // Typed parameters restore the AsyncTask<Params, Progress, Result>
        // contract (the raw type relied on unchecked overriding).
        new AsyncTask<Void, Void, Exception>() {
            @Override
            protected Exception doInBackground(Void... params) {
                try {
                    Assets assets = new Assets(DeepSpeechRecognitionService.this);
                    File assetDir = assets.syncAssets();

                    model = new DeepSpeechModel(assetDir.toString() + "/deepspeech-catala/model.tflite");
                    model.enableExternalScorer(assetDir.toString() + "/deepspeech-catala/kenlm.scorer");

                    setupRecognizer();
                } catch (IOException e) {
                    return e;
                }
                return null;
            }

            @Override
            protected void onPostExecute(Exception result) {
                if (result != null) {
                    Log.e(TAG, "Failed to init recognizer " + result);
                    error(android.speech.SpeechRecognizer.ERROR_CLIENT);
                } else {
                    readyForSpeech(new Bundle());
                    beginningOfSpeech();
                }
            }
        }.execute();
    }

    @Override
    public void onDestroy() {
        super.onDestroy();

        if (speechService != null) {
            speechService.cancel();
            speechService.shutdown();
        }
    }

    /**
     * Creates the streaming service and starts listening.
     *
     * Bug fix: the original declared a LOCAL {@code DeepSpeechService} that
     * shadowed the field, so the field stayed null and onStopListening /
     * onCancel / onDestroy / error / onTimeout never reached the running
     * service. It also caught and only logged the IOException despite the
     * {@code throws} clause, so a failed init was reported as success; the
     * exception now propagates to doInBackground's handler.
     */
    private void setupRecognizer() throws IOException {
        Log.i(TAG, "Setting up recognizer");
        speechService = new DeepSpeechService(this.model, 16000.0f);
        speechService.addListener(this);
        speechService.startListening();
    }

    private void readyForSpeech(Bundle bundle) {
        try {
            mCallback.readyForSpeech(bundle);
        } catch (RemoteException e) {
            // empty
        }
    }

    /**
     * Forwards a result bundle to the framework.
     *
     * Bug fix: the original cancelled the speech service unconditionally,
     * killing the audio stream after every PARTIAL result. The service is now
     * cancelled only for final results — consistent with the fix applied to
     * VoskRecognitionService in this same change set.
     */
    private void results(Bundle bundle, boolean isFinal) {
        try {
            if (isFinal) {
                if (speechService != null) {
                    speechService.cancel();
                }
                mCallback.results(bundle);
            } else {
                mCallback.partialResults(bundle);
            }
        } catch (RemoteException e) {
            // empty
        }
    }

    private Bundle createResultsBundle(String hypothesis) {
        ArrayList<String> hypotheses = new ArrayList<>();
        hypotheses.add(hypothesis);
        Bundle bundle = new Bundle();
        bundle.putStringArrayList(android.speech.SpeechRecognizer.RESULTS_RECOGNITION, hypotheses);
        return bundle;
    }

    private void beginningOfSpeech() {
        try {
            mCallback.beginningOfSpeech();
        } catch (RemoteException e) {
            // empty
        }
    }

    /**
     * Reports an error to the framework. Guards against a null service: this
     * path is reached when initialization itself failed, i.e. before
     * {@code speechService} was ever assigned.
     */
    private void error(int errorCode) {
        if (speechService != null) {
            speechService.cancel();
        }
        try {
            mCallback.error(errorCode);
        } catch (RemoteException e) {
            // empty
        }
    }

    @Override
    public void onResult(String hypothesis) {
        if (hypothesis != null) {
            Log.i(TAG, hypothesis);
            results(createResultsBundle(hypothesis), true);
        }
    }

    @Override
    public void onPartialResult(String hypothesis) {
        if (hypothesis != null) {
            Log.i(TAG, hypothesis);
            results(createResultsBundle(hypothesis), false);
        }
    }

    @Override
    public void onError(Exception e) {
        // Log the full throwable: e.getMessage() may legitimately be null.
        Log.e(TAG, "Recognition error", e);
        error(android.speech.SpeechRecognizer.ERROR_CLIENT);
    }

    @Override
    public void onTimeout() {
        if (speechService != null) {
            speechService.cancel();
            speechService.startListening();
        }
    }
}
// Copyright 2019 Alpha Cephei Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cat.oreilly.localstt;

import static java.lang.String.format;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;

import android.media.AudioFormat;
import android.media.AudioRecord;
import android.media.MediaRecorder.AudioSource;
import android.os.Handler;
import android.os.Looper;
import android.util.Log;

import org.kaldi.RecognitionListener;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechModel;
import org.mozilla.deepspeech.libdeepspeech.DeepSpeechStreamingState;

import com.konovalov.vad.Vad;
import com.konovalov.vad.VadConfig;

/**
 * Service that records audio in a thread, passes it to a recognizer and emits
 * recognition results. Recognition events are passed to a client using
 * {@link RecognitionListener}
 *
 */
public class DeepSpeechService {

    protected static final String TAG = DeepSpeechService.class.getSimpleName();

    private final DeepSpeechModel model;
    // Streaming decode state; created once per service instance.
    private final DeepSpeechStreamingState streamContext;
    // Voice-activity detector used to stop feeding audio after end of speech.
    private final Vad vad;

    private final int sampleRate;
    private final static float BUFFER_SIZE_SECONDS = 0.4f;
    private int bufferSize;
    private final AudioRecord recorder;

    // Non-null exactly while recognition is running.
    private Thread recognizerThread;

    // All listener callbacks are posted to the main looper.
    private final Handler mainHandler = new Handler(Looper.getMainLooper());

    private final Collection<RecognitionListener> listeners = new HashSet<>();

    /**
     * Creates speech service. Service holds the AudioRecord object, so you need to
     * call {@link #shutdown} in order to properly finalize it.
     *
     * @param model      DeepSpeech model to stream audio into (owned by the caller)
     * @param sampleRate capture rate in Hz; must match the model's expected rate
     * @throws IOException thrown if audio recorder can not be created for some
     *                     reason.
     */
    public DeepSpeechService(DeepSpeechModel model, float sampleRate) throws IOException {
        this.model = model;
        this.sampleRate = (int) sampleRate;
        this.streamContext = model.createStream();

        // NOTE(review): VAD is configured for 480-sample frames but isSpeech()
        // is later called with the full 0.4 s capture buffer — confirm the
        // library tolerates oversized frames.
        vad = new Vad(VadConfig.newBuilder().setSampleRate(VadConfig.SampleRate.SAMPLE_RATE_16K)
                .setFrameSize(VadConfig.FrameSize.FRAME_SIZE_480).setMode(VadConfig.Mode.NORMAL).build());

        bufferSize = Math.round(this.sampleRate * BUFFER_SIZE_SECONDS);
        // bufferSize is in samples; AudioRecord wants bytes (16-bit PCM = 2 bytes/sample).
        recorder = new AudioRecord(AudioSource.VOICE_RECOGNITION, this.sampleRate, AudioFormat.CHANNEL_IN_MONO,
                AudioFormat.ENCODING_PCM_16BIT, bufferSize * 2);

        if (recorder.getState() == AudioRecord.STATE_UNINITIALIZED) {
            recorder.release();
            throw new IOException("Failed to initialize recorder. Microphone might be already in use.");
        }
        Log.i(TAG, "DeepSpeechService initialized");
    }

    /**
     * Adds listener.
     */
    public void addListener(RecognitionListener listener) {
        synchronized (listeners) {
            listeners.add(listener);
        }
    }

    /**
     * Removes listener.
     */
    public void removeListener(RecognitionListener listener) {
        synchronized (listeners) {
            listeners.remove(listener);
        }
    }

    /**
     * Starts recognition. Does nothing if recognition is active.
     *
     * @return true if recognition was actually started
     */
    public boolean startListening() {
        if (null != recognizerThread)
            return false;

        recognizerThread = new RecognizerThread();
        recognizerThread.start();
        return true;
    }

    /**
     * Starts recognition. After specified timeout listening stops and the
     * endOfSpeech signals about that. Does nothing if recognition is active.
     *
     * @param timeout timeout in milliseconds to listen.
     *
     * @return true if recognition was actually started
     */
    public boolean startListening(int timeout) {
        if (null != recognizerThread)
            return false;

        recognizerThread = new RecognizerThread(timeout);
        recognizerThread.start();
        return true;
    }

    /** Interrupts and joins the recognizer thread, if any. */
    private boolean stopRecognizerThread() {
        if (null == recognizerThread)
            return false;

        try {
            recognizerThread.interrupt();
            recognizerThread.join();
        } catch (InterruptedException e) {
            // Restore the interrupted status.
            Thread.currentThread().interrupt();
        }

        recognizerThread = null;
        return true;
    }

    /**
     * Stops recognition. All listeners should receive final result if there is any.
     * Does nothing if recognition is not active.
     *
     * @return true if recognition was actually stopped
     */
    public boolean stop() {
        boolean result = stopRecognizerThread();
        if (result) {
            mainHandler.post(new ResultEvent(model.finishStream(streamContext), true));
        }
        return result;
    }

    /**
     * Cancels recognition. Listeners do not receive final result. Does nothing if
     * recognition is not active.
     *
     * Bug fix: the original called {@code model.freeModel()} here with a
     * comment claiming it "resets recognizer state". freeModel() destroys the
     * native model itself, so a subsequent {@code startListening()} (e.g. the
     * onTimeout restart path) would stream into freed memory and
     * {@link #shutdown} would free it a second time. Cancel now only stops the
     * capture thread; the model remains owned until shutdown().
     *
     * @return true if recognition was actually canceled
     */
    public boolean cancel() {
        Log.d(TAG, "#cancel");
        return stopRecognizerThread();
    }

    /**
     * Shutdown the recognizer and release the recorder
     */
    public void shutdown() {
        Log.d(TAG, "#shutdown");
        this.model.freeModel();
        recorder.release();
    }

    private final class RecognizerThread extends Thread {

        private int remainingSamples;
        private int timeoutSamples;
        private final static int NO_TIMEOUT = -1;

        public RecognizerThread(int timeout) {
            if (timeout != NO_TIMEOUT)
                this.timeoutSamples = timeout * sampleRate / 1000;
            else
                this.timeoutSamples = NO_TIMEOUT;
            this.remainingSamples = this.timeoutSamples;
        }

        public RecognizerThread() {
            this(NO_TIMEOUT);
        }

        @Override
        public void run() {
            Log.i(TAG, "Start Recording...");

            vad.start();
            recorder.startRecording();
            if (recorder.getRecordingState() == AudioRecord.RECORDSTATE_STOPPED) {
                recorder.stop();
                IOException ioe = new IOException("Failed to start recording. Microphone might be already in use.");
                mainHandler.post(new OnErrorEvent(ioe));
                return;
            }

            short[] buffer = new short[bufferSize];
            int nread = recorder.read(buffer, 0, buffer.length);
            // Tracks whether speech was ever seen so trailing silence ends the
            // utterance only AFTER speech, not before it started.
            boolean speechDetected = false;
            boolean feedAudio = true;

            while (!interrupted() && ((timeoutSamples == NO_TIMEOUT) || (remainingSamples > 0)) && feedAudio) {

                if (nread < 0) {
                    throw new RuntimeException("error reading audio buffer");
                } else {
                    Log.i(TAG, "Feeding audio");
                    model.feedAudioContent(streamContext, buffer, nread);
                    boolean isSpeech = vad.isSpeech(buffer);
                    if (isSpeech) {
                        Log.d(TAG, "Speech detected");
                        speechDetected = true;
                    }
                    if (speechDetected && !isSpeech) {
                        // Speech followed by silence: end of utterance.
                        Log.d(TAG, "Silence detected");
                        feedAudio = false;
                    }

                }

                if (timeoutSamples != NO_TIMEOUT) {
                    remainingSamples = remainingSamples - nread;
                }
                nread = recorder.read(buffer, 0, buffer.length);
            }

            // Finish the stream and deliver the final hypothesis.
            mainHandler.post(new ResultEvent(model.finishStream(streamContext), true));

            recorder.stop();
            vad.stop();

            // Remove all pending notifications.
            mainHandler.removeCallbacksAndMessages(null);

            // If we met timeout signal that speech ended
            if (timeoutSamples != NO_TIMEOUT && remainingSamples <= 0) {
                mainHandler.post(new TimeoutEvent());
            }
        }
    }

    /** Base class: fans one callback out to every registered listener. */
    private abstract class RecognitionEvent implements Runnable {
        public void run() {
            RecognitionListener[] emptyArray = new RecognitionListener[0];
            for (RecognitionListener listener : listeners.toArray(emptyArray))
                execute(listener);
        }

        protected abstract void execute(RecognitionListener listener);
    }

    private class ResultEvent extends RecognitionEvent {
        protected final String hypothesis;
        private final boolean finalResult;

        ResultEvent(String hypothesis, boolean finalResult) {
            this.hypothesis = hypothesis;
            this.finalResult = finalResult;
        }

        @Override
        protected void execute(RecognitionListener listener) {
            if (finalResult)
                listener.onResult(hypothesis);
            else
                listener.onPartialResult(hypothesis);
        }
    }

    private class OnErrorEvent extends RecognitionEvent {
        private final Exception exception;

        OnErrorEvent(Exception exception) {
            this.exception = exception;
        }

        @Override
        protected void execute(RecognitionListener listener) {
            listener.onError(exception);
        }
    }

    private class TimeoutEvent extends RecognitionEvent {
        @Override
        protected void execute(RecognitionListener listener) {
            listener.onTimeout();
        }
    }
}
static String TAG = VoskRecognitionService.class.getSimpleName(); private KaldiRecognizer recognizer; private SpeechService speechService; private Model model; @@ -38,13 +38,13 @@ public class VoskRecognitionService extends RecognitionService implements Recogn @Override protected void onCancel(Callback callback) { Log.i(TAG, "onCancel"); - results(new Bundle()); + results(new Bundle(), true); } @Override protected void onStopListening(Callback callback) { Log.i(TAG, "onStopListening"); - results(new Bundle()); + results(new Bundle(), true); } private void runRecognizerSetup(final Intent intent) { @@ -108,10 +108,14 @@ public class VoskRecognitionService extends RecognitionService implements Recogn } } - private void results(Bundle bundle) { - speechService.cancel(); + private void results(Bundle bundle, boolean isFinal) { try { - mCallback.results(bundle); + if (isFinal) { + speechService.cancel(); + mCallback.results(bundle); + } else { + mCallback.partialResults(bundle); + } } catch (RemoteException e) { // empty } @@ -149,7 +153,7 @@ public class VoskRecognitionService extends RecognitionService implements Recogn Gson gson = new Gson(); Map map = gson.fromJson(hypothesis, Map.class); String text = map.get("text"); - results(createResultsBundle(text)); + results(createResultsBundle(text), true); } } @@ -160,7 +164,7 @@ public class VoskRecognitionService extends RecognitionService implements Recogn Gson gson = new Gson(); Map map = gson.fromJson(hypothesis, Map.class); String text = map.get("partial"); - results(createResultsBundle(text)); + results(createResultsBundle(text), false); } }