speech
Loading...
Searching...
No Matches
main.cpp
1/*
2 * Copyright (C) 2018 iCub Facility - Istituto Italiano di Tecnologia
3 * Author: Vadim Tikhanoff
4 * email: vadim.tikhanoff@iit.it
5 * Permission is granted to copy, distribute, and/or modify this program
6 * under the terms of the GNU General Public License, version 2 or any
7 * later version published by the Free Software Foundation.
8 *
9 * A copy of the license can be found at
10 * http://www.robotcub.org/icub/license/gpl.txt
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
15 * Public License for more details
16 */
17
18
19#include <vector>
20#include <iostream>
21#include <deque>
22#include <cstdio>
23#include <cmath>
24#include <chrono>
25#include <ctime>
26
27#include <fstream>
28#include <iterator>
29#include <string>
30#include <map>
31
32#include <yarp/os/BufferedPort.h>
33#include <yarp/os/ResourceFinder.h>
34#include <yarp/os/RpcClient.h>
35#include <yarp/os/RFModule.h>
36#include <yarp/os/Network.h>
37#include <yarp/os/Time.h>
38#include <yarp/os/Log.h>
39#include <yarp/os/LogStream.h>
40#include <yarp/os/Semaphore.h>
41#include <yarp/sig/SoundFile.h>
42#include <yarp/dev/PolyDriver.h>
43#include <yarp/sig/SoundFile.h>
44
45#include <grpc++/grpc++.h>
46#include "google/cloud/speech/v1/cloud_speech.grpc.pb.h"
47
48#include "googleSpeech_IDL.h"
49
50using google::cloud::speech::v1::RecognitionConfig;
51using google::cloud::speech::v1::Speech;
52using google::cloud::speech::v1::RecognizeRequest;
53using google::cloud::speech::v1::RecognizeResponse;
/// Serialises the sound callback and the start/stop acquisition requests.
std::mutex mtx;

/// Set by Processing::checkState() when the published state string changes;
/// consumed (and cleared) by Module::updateModule().
bool is_changed = false;
56
57static const std::map<grpc::StatusCode, std::string> status_code_to_string {
58 {grpc::OK, "ok"},
59 {grpc::CANCELLED, "cancelled"},
60 {grpc::UNKNOWN, "unknown"},
61 {grpc::INVALID_ARGUMENT, "invalid_argument"},
62 {grpc::DEADLINE_EXCEEDED, "deadline_exceeded"},
63 {grpc::NOT_FOUND, "not_found"},
64 {grpc::ALREADY_EXISTS, "already_exists"},
65 {grpc::PERMISSION_DENIED, "permission_denied"},
66 {grpc::UNAUTHENTICATED, "unauthenticated"},
67 {grpc::RESOURCE_EXHAUSTED , "resource_exhausted"},
68 {grpc::FAILED_PRECONDITION, "failed_precondition"},
69 {grpc::ABORTED, "aborted"},
70 {grpc::OUT_OF_RANGE, "out_of_range"},
71 {grpc::UNIMPLEMENTED, "unimplemented"},
72 {grpc::INTERNAL, "internal"},
73 {grpc::UNAVAILABLE, "unavailable"},
74 {grpc::DATA_LOSS, "data_loss"},
75 {grpc::DO_NOT_USE, "do_not_use"}
76};
77/********************************************************/
78class Processing : public yarp::os::TypedReaderCallback<yarp::sig::Sound>
79{
80 std::string moduleName;
81 yarp::os::RpcServer handlerPort;
82 yarp::os::BufferedPort<yarp::sig::Sound> port;
83 yarp::os::BufferedPort<yarp::os::Bottle> targetPort;
84 yarp::os::RpcClient audioCommand;
85 std::string &state;
86 std::int64_t &elapsed_seconds;
87
88 std::deque<yarp::sig::Sound> sounds;
89
90 int samples;
91 int channels;
92 int padding;
93 bool getSounds;
94 bool sendForQuery;
95 std::string language;
96 int sample_rate;
97
98 bool uniqueSound;
99
100 std::chrono::time_point<std::chrono::system_clock> start, end;
101
102public:
103 /********************************************************/
104
105 Processing( const std::string &moduleName, const std::string &language, const int sample_rate, std::string &state, std::int64_t &elapsed_seconds ) : state(state), elapsed_seconds(elapsed_seconds)
106 {
107 this->moduleName = moduleName;
108 yInfo() << "language " << language;
109 yInfo() << "sample_rate " << sample_rate;
110 this->language = language;
111 this->sample_rate = sample_rate;
112
113 port.useCallback(*this);
114 port.setStrict();
115 samples = 0;
116 channels = 0;
117 padding = 0;
118 getSounds = false;
119 sendForQuery = false;
120 uniqueSound = false;
121 }
122
123 /********************************************************/
124 void setUsingUniqueSound()
125 {
126 uniqueSound = true;
127 getSounds = true;
128 sendForQuery = true;
129 }
130
131 /********************************************************/
132 ~Processing()
133 {
134
135 };
136
137 /********************************************************/
138 bool open()
139 {
140 port.setStrict(true);
141
142 port.open("/" + moduleName + "/sound:i");
143 targetPort.open("/"+ moduleName + "/result:o");
144 audioCommand.open("/"+ moduleName + "/commands:rpc");
145
146 return true;
147 }
148
149 /********************************************************/
150 void close()
151 {
152 port.close();
153 targetPort.close();
154 audioCommand.close();
155 }
156
157 /********************************************************/
158 using yarp::os::TypedReaderCallback<yarp::sig::Sound>::onRead;
159 void onRead( yarp::sig::Sound& sound ) override
160 {
161 std::lock_guard<std::mutex> lg(mtx);
162
163 if(getSounds)
164 {
165 int ct = port.getPendingReads();
166 while (ct>padding)
167 {
168 ct = port.getPendingReads();
169 yWarning() << "Dropping sound packet -- " << ct << " packet(s) behind";
170 port.read();
171 }
172 collectFrame(sound);
173 }
174
175 if(sendForQuery)
176 {
177 //unpack sound
178 yarp::sig::Sound total;
179 total.resize(samples,channels);
180 long int at = 0;
181 while (!sounds.empty()) {
182 yarp::sig::Sound& tmp = sounds.front();
183
184 yDebug() << "channels " << channels;
185 yDebug() << "samples " << tmp.getSamples();
186 yDebug() << "values " << tmp.get(0,0);
187
188 for (int i=0; i<channels; i++) {
189 for (int j=0; j<tmp.getSamples(); j++) {
190 total.set(tmp.get(j,i),at+j,i);
191 }
192 }
193 total.setFrequency(tmp.getFrequency());
194 at += tmp.getSamples();
195 sounds.pop_front();
196 }
197 yarp::os::Bottle &outTargets = targetPort.prepare();
198
199 yarp::os::Bottle cmd, rep;
200 cmd.addString("stop");
201 if (audioCommand.write(cmd, rep))
202 {
203 yDebug() << "cmd.addString(stop)" << rep.toString().c_str();
204 }
205
206 outTargets = queryGoogle(total);
207
208 if (!uniqueSound)
209 sendForQuery = false;
210
211 samples = 0;
212 channels = 0;
213 if(outTargets.size()>0){
214 targetPort.write();
215 }
216 yDebug() << "done querying google";
217 }
218 yarp::os::Time::yield();
219 }
220
221 /********************************************************/
222 void collectFrame(yarp::sig::Sound& sound)
223 {
224 sounds.push_back(sound);
225 samples += sound.getSamples();
226 channels = sound.getChannels();
227 yDebug() << (long int) sounds.size() << "sound frames buffered in memory ( " << (long int) samples << " samples)";
228 }
229
230 /********************************************************/
231 yarp::os::Bottle queryGoogle(yarp::sig::Sound& sound)
232 {
233 RecognizeRequest request;
234
235 yDebug() << "in queryGoogle";
236 yDebug() << "language" << language;
237 yarp::os::Bottle b;
238 b.clear();
239 auto creds = grpc::GoogleDefaultCredentials();
240 auto channel = grpc::CreateChannel("speech.googleapis.com", creds);
241 std::unique_ptr<Speech::Stub> speech(Speech::NewStub(channel));
242
243 setArguments(request.mutable_config());
244
245 yInfo() << "getFrequency " << sound.getFrequency();
246 yInfo() << "getSamples " << sound.getSamples();
247 yInfo() << "getChannels " << sound.getChannels();
248 yInfo() << "getBytesPerSamples " << sound.getBytesPerSample();
249
250 auto vec_i = sound.getNonInterleavedAudioRawData();
251 auto s1 = std::vector<short>(vec_i.begin(), vec_i.end());
252
253 yInfo() << "AudioRawData s1.size()" << s1.size();
254
255 request.mutable_audio()->mutable_content()->assign((char*)s1.data(), s1.size()*2);
256
257 end = std::chrono::system_clock::now();
258
259 double start_elapsed_seconds = std::chrono::duration_cast<std::chrono::milliseconds> (end-start).count();
260
261 yInfo() << "From start to mutable audio " << start_elapsed_seconds / 1000 << " seconds passed";
262
263 grpc::ClientContext context;
264 RecognizeResponse response;
265
266 start = std::chrono::system_clock::now();
267 if (uniqueSound){
268 checkState("Busy");
269 }
270 yarp::os::Time::delay(0.2);
271 grpc::Status rpc_status = speech->Recognize(&context, request, &response);
272 std::string status_string = status_code_to_string.at(rpc_status.error_code());
273 end = std::chrono::system_clock::now();
274
275 elapsed_seconds = std::chrono::duration_cast<std::chrono::milliseconds> (end-start).count();
276 yInfo() << "Sending to google took " << elapsed_seconds << " ms";
277
278 if (!rpc_status.ok()) {
279 // Report the RPC failure.
280 yInfo() << rpc_status.error_message();
281 b.clear();
282 checkState("Failure_" + status_string);
283 }
284 else{
285 yInfo() << "Size of response " << response.results_size();
286 if(response.results_size()>0){
287 checkState("Done");
288
289 // Dump the transcript of all the results.
290 for (int r = 0; r < response.results_size(); ++r)
291 {
292 auto result = response.results(r);
293 for (int a = 0; a < result.alternatives_size(); ++a)
294 {
295 auto alternative = result.alternatives(a);
296 yInfo() << alternative.confidence();
297 yInfo() << alternative.transcript();
298 b.addString(alternative.transcript());
299 }
300 }
301 }
302 else{
303 checkState("Empty");
304 }
305 }
306 return b;
307 }
308
309 /********************************************************/
310 bool setLanguageCode(const std::string &languageCode)
311 {
312 language = languageCode;
313 return true;
314 }
315
316 /********************************************************/
317 std::string getLanguageCode()
318 {
319 return language;
320 }
321
322 /********************************************************/
323 void setArguments(RecognitionConfig* config)
324 {
325 config->set_language_code(language.c_str());
326 config->set_sample_rate_hertz(sample_rate);
327 config->set_encoding(RecognitionConfig::LINEAR16);
328
329 config->set_use_enhanced(true); // Can be used with the correct model. If true but no model specified, it does nothing.
330 auto metadata = config->mutable_metadata();
331
332 metadata->set_microphone_distance(google::cloud::speech::v1::RecognitionMetadata_MicrophoneDistance_MIDFIELD);
333 metadata->set_recording_device_type(google::cloud::speech::v1::RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE);
334 metadata->set_interaction_type(google::cloud::speech::v1::RecognitionMetadata_InteractionType_VOICE_COMMAND);
335 metadata->set_original_media_type(google::cloud::speech::v1::RecognitionMetadata_OriginalMediaType_AUDIO);
336 }
337
338 /********************************************************/
339 bool start_acquisition()
340 {
341 std::lock_guard<std::mutex> lg(mtx);
342 yarp::os::Bottle cmd, rep;
343 //cmd.addVocab32("start");
344 cmd.addString("start");
345 if (audioCommand.write(cmd, rep))
346 {
347 yDebug() << "cmd.addString(start)" << rep.toString().c_str();
348 }
349
350 start = std::chrono::system_clock::now();
351 getSounds = true;
352 checkState("Listening");
353 return true;
354 }
355
356 /********************************************************/
357 bool stop_acquisition()
358 {
359 std::lock_guard<std::mutex> lg(mtx);
360
361 if (!uniqueSound)
362 getSounds = false;
363
364 sendForQuery = true;
365 checkState("Busy");
366 return true;
367 }
368 /********************************************************/
369 bool checkState(std::string new_state)
370 {
371 if(new_state!=state){
372 is_changed=true;
373 state=new_state;
374 }
375 else{
376 is_changed=false;
377 }
378 return is_changed;
379 }
380};
381
382/********************************************************/
383class Module : public yarp::os::RFModule, public googleSpeech_IDL
384{
385 yarp::os::ResourceFinder *rf;
386 yarp::os::RpcServer rpcPort;
387 std::string state;
388 std::int64_t elapsed_seconds;
389 yarp::os::BufferedPort<yarp::os::Bottle> statePort;
390
391 Processing *processing;
392 friend class processing;
393
394 bool closing;
395 bool uniqueSound;
396 std::vector<std::string> allLanguageCodes;
397
398 /********************************************************/
399 bool attach(yarp::os::RpcServer &source)
400 {
401 return this->yarp().attachAsServer(source);
402 }
403
404public:
405
406 /********************************************************/
407 bool configure(yarp::os::ResourceFinder &rf)
408 {
409 this->rf=&rf;
410 this->state="Ready";
411 this->elapsed_seconds=0;
412 uniqueSound = false;
413
414 std::string moduleName = rf.check("name", yarp::os::Value("googleSpeech"), "module name (string)").asString();
415 std::string language = rf.check("language_code", yarp::os::Value("en-US"), "language (string)").asString();
416 int sample_rate = rf.check("sample_rate_hertz", yarp::os::Value(16000), "sample rate (int)").asInt32();
417
418 if (rf.check("uniqueSound", "use a yarp::sig::Sound instead of a microphone"))
419 uniqueSound = true;
420
421 if (rf.check("languageCodes", "Getting language codes"))
422 {
423 yarp::os::Bottle &grp=rf.findGroup("languageCodes");
424 int sz=grp.size()-1;
425
426 for (int i=0; i<sz; i++)
427 allLanguageCodes.push_back(grp.get(1+i).asString());
428 }
429
430 setName(moduleName.c_str());
431
432 rpcPort.open(("/"+getName("/rpc")).c_str());
433 statePort.open("/"+ moduleName + "/state:o");
434
435 closing = false;
436
437 processing = new Processing( moduleName, language, sample_rate, state, elapsed_seconds);
438
439 /* now start the thread to do the work */
440 processing->open();
441
442 if (uniqueSound)
443 processing->setUsingUniqueSound();
444
445 attach(rpcPort);
446
447 return true;
448 }
449
450 /********************************************************/
451 bool setLanguage(const std::string& languageCode)
452 {
453 bool returnVal = false;
454
455 std::string language;
456
457 for (int i = 0; i < allLanguageCodes.size(); i++)
458 {
459 if (languageCode == allLanguageCodes[i])
460 {
461 language = languageCode;
462 processing->setLanguageCode(languageCode);
463 returnVal = true;
464 break;
465 }
466 }
467
468 return returnVal;
469 }
470
471 /********************************************************/
472 std::string getLanguageCode()
473 {
474 return processing->getLanguageCode();
475 }
476
477
478 /**********************************************************/
479 bool close()
480 {
481 statePort.close();
482 processing->close();
483 delete processing;
484 return true;
485 }
486
487 /**********************************************************/
488 bool start()
489 {
490 processing->start_acquisition();
491 return true;
492 }
493
494 /**********************************************************/
495 bool stop()
496 {
497 processing->stop_acquisition();
498 return true;
499 }
500
501 /********************************************************/
502 double getPeriod()
503 {
504 return 0.1;
505 }
506
507 /********************************************************/
508 bool quit()
509 {
510 closing=true;
511 return true;
512 }
513
514 /********************************************************/
515 bool updateModule()
516 {
517 if(is_changed){
518 is_changed=false;
519 yarp::os::Bottle &outTargets = statePort.prepare();
520 outTargets.clear();
521 outTargets.addString(state);
522 yDebug() << "outTarget:" << outTargets.toString().c_str();
523 statePort.write();
524 }
525 return !closing;
526 }
527 /********************************************************/
528 std::string getState()
529 {
530 return state;
531 }
532
533 /********************************************************/
534 std::int64_t getProcessingTime()
535 {
536 return elapsed_seconds;
537 }
538
539};
540
541/********************************************************/
542int main(int argc, char *argv[])
543{
544 yarp::os::Network::init();
545
546 yarp::os::Network yarp;
547 if (!yarp.checkNetwork())
548 {
549 yError("YARP server not available!");
550 return 1;
551 }
552
553 Module module;
554 yarp::os::ResourceFinder rf;
555
556 rf.setVerbose( true );
557 rf.setDefaultContext( "googleSpeech" );
558 rf.setDefaultConfigFile( "config.ini" );
559 rf.setDefault("name","googleSpeech");
560 rf.configure(argc,argv);
561
562 return module.runModule(rf);
563}