speech
All Data Structures Functions Modules Pages
main.cpp
1 /*
2  * Copyright (C) 2018 iCub Facility - Istituto Italiano di Tecnologia
3  * Author: Vadim Tikhanoff
4  * email: vadim.tikhanoff@iit.it
5  * Permission is granted to copy, distribute, and/or modify this program
6  * under the terms of the GNU General Public License, version 2 or any
7  * later version published by the Free Software Foundation.
8  *
9  * A copy of the license can be found at
10  * http://www.robotcub.org/icub/license/gpl.txt
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
15  * Public License for more details
16  */
17 
18 
19 #include <vector>
20 #include <iostream>
21 #include <deque>
22 #include <cstdio>
23 #include <cmath>
24 #include <chrono>
25 #include <ctime>
26 
27 #include <fstream>
28 #include <iterator>
29 #include <string>
30 #include <map>
31 
32 #include <yarp/os/BufferedPort.h>
33 #include <yarp/os/ResourceFinder.h>
34 #include <yarp/os/RpcClient.h>
35 #include <yarp/os/RFModule.h>
36 #include <yarp/os/Network.h>
37 #include <yarp/os/Time.h>
38 #include <yarp/os/Log.h>
39 #include <yarp/os/LogStream.h>
40 #include <yarp/os/Semaphore.h>
41 #include <yarp/sig/SoundFile.h>
42 #include <yarp/dev/PolyDriver.h>
43 #include <yarp/sig/SoundFile.h>
44 
45 #include <grpc++/grpc++.h>
46 #include "google/cloud/speech/v1/cloud_speech.grpc.pb.h"
47 
48 #include "googleSpeech_IDL.h"
49 
50 using google::cloud::speech::v1::RecognitionConfig;
51 using google::cloud::speech::v1::Speech;
52 using google::cloud::speech::v1::RecognizeRequest;
53 using google::cloud::speech::v1::RecognizeResponse;
54 std::mutex mtx;
55 bool is_changed;
56 
57 static const std::map<grpc::StatusCode, std::string> status_code_to_string {
58  {grpc::OK, "ok"},
59  {grpc::CANCELLED, "cancelled"},
60  {grpc::UNKNOWN, "unknown"},
61  {grpc::INVALID_ARGUMENT, "invalid_argument"},
62  {grpc::DEADLINE_EXCEEDED, "deadline_exceeded"},
63  {grpc::NOT_FOUND, "not_found"},
64  {grpc::ALREADY_EXISTS, "already_exists"},
65  {grpc::PERMISSION_DENIED, "permission_denied"},
66  {grpc::UNAUTHENTICATED, "unauthenticated"},
67  {grpc::RESOURCE_EXHAUSTED , "resource_exhausted"},
68  {grpc::FAILED_PRECONDITION, "failed_precondition"},
69  {grpc::ABORTED, "aborted"},
70  {grpc::OUT_OF_RANGE, "out_of_range"},
71  {grpc::UNIMPLEMENTED, "unimplemented"},
72  {grpc::INTERNAL, "internal"},
73  {grpc::UNAVAILABLE, "unavailable"},
74  {grpc::DATA_LOSS, "data_loss"},
75  {grpc::DO_NOT_USE, "do_not_use"}
76 };
77 /********************************************************/
78 class Processing : public yarp::os::TypedReaderCallback<yarp::sig::Sound>
79 {
80  std::string moduleName;
81  yarp::os::RpcServer handlerPort;
82  yarp::os::BufferedPort<yarp::sig::Sound> port;
83  yarp::os::BufferedPort<yarp::os::Bottle> targetPort;
84  yarp::os::RpcClient audioCommand;
85  std::string &state;
86  std::int64_t &elapsed_seconds;
87 
88  std::deque<yarp::sig::Sound> sounds;
89 
90  int samples;
91  int channels;
92  int padding;
93  bool getSounds;
94  bool sendForQuery;
95  std::string language;
96  int sample_rate;
97 
98  bool uniqueSound;
99 
100  std::chrono::time_point<std::chrono::system_clock> start, end;
101 
102 public:
103  /********************************************************/
104 
105  Processing( const std::string &moduleName, const std::string &language, const int sample_rate, std::string &state, std::int64_t &elapsed_seconds ) : state(state), elapsed_seconds(elapsed_seconds)
106  {
107  this->moduleName = moduleName;
108  yInfo() << "language " << language;
109  yInfo() << "sample_rate " << sample_rate;
110  this->language = language;
111  this->sample_rate = sample_rate;
112 
113  port.useCallback(*this);
114  port.setStrict();
115  samples = 0;
116  channels = 0;
117  padding = 0;
118  getSounds = false;
119  sendForQuery = false;
120  uniqueSound = false;
121  }
122 
123  /********************************************************/
124  void setUsingUniqueSound()
125  {
126  uniqueSound = true;
127  getSounds = true;
128  sendForQuery = true;
129  }
130 
131  /********************************************************/
132  ~Processing()
133  {
134 
135  };
136 
137  /********************************************************/
138  bool open()
139  {
140  port.setStrict(true);
141 
142  port.open("/" + moduleName + "/sound:i");
143  targetPort.open("/"+ moduleName + "/result:o");
144  audioCommand.open("/"+ moduleName + "/commands:rpc");
145 
146  return true;
147  }
148 
149  /********************************************************/
150  void close()
151  {
152  port.close();
153  targetPort.close();
154  audioCommand.close();
155  }
156 
157  /********************************************************/
158  using yarp::os::TypedReaderCallback<yarp::sig::Sound>::onRead;
159  void onRead( yarp::sig::Sound& sound ) override
160  {
161  std::lock_guard<std::mutex> lg(mtx);
162 
163  if(getSounds)
164  {
165  int ct = port.getPendingReads();
166  while (ct>padding)
167  {
168  ct = port.getPendingReads();
169  yWarning() << "Dropping sound packet -- " << ct << " packet(s) behind";
170  port.read();
171  }
172  collectFrame(sound);
173  }
174 
175  if(sendForQuery)
176  {
177  //unpack sound
178  yarp::sig::Sound total;
179  total.resize(samples,channels);
180  long int at = 0;
181  while (!sounds.empty()) {
182  yarp::sig::Sound& tmp = sounds.front();
183 
184  yDebug() << "channels " << channels;
185  yDebug() << "samples " << tmp.getSamples();
186  yDebug() << "values " << tmp.get(0,0);
187 
188  for (int i=0; i<channels; i++) {
189  for (int j=0; j<tmp.getSamples(); j++) {
190  total.set(tmp.get(j,i),at+j,i);
191  }
192  }
193  total.setFrequency(tmp.getFrequency());
194  at += tmp.getSamples();
195  sounds.pop_front();
196  }
197  yarp::os::Bottle &outTargets = targetPort.prepare();
198 
199  yarp::os::Bottle cmd, rep;
200  cmd.addString("stop");
201  if (audioCommand.write(cmd, rep))
202  {
203  yDebug() << "cmd.addString(stop)" << rep.toString().c_str();
204  }
205 
206  outTargets = queryGoogle(total);
207 
208  if (!uniqueSound)
209  sendForQuery = false;
210 
211  samples = 0;
212  channels = 0;
213  if(outTargets.size()>0){
214  targetPort.write();
215  }
216  yDebug() << "done querying google";
217  }
218  yarp::os::Time::yield();
219  }
220 
221  /********************************************************/
222  void collectFrame(yarp::sig::Sound& sound)
223  {
224  sounds.push_back(sound);
225  samples += sound.getSamples();
226  channels = sound.getChannels();
227  yDebug() << (long int) sounds.size() << "sound frames buffered in memory ( " << (long int) samples << " samples)";
228  }
229 
230  /********************************************************/
231  yarp::os::Bottle queryGoogle(yarp::sig::Sound& sound)
232  {
233  RecognizeRequest request;
234 
235  yDebug() << "in queryGoogle";
236  yDebug() << "language" << language;
237  yarp::os::Bottle b;
238  b.clear();
239  auto creds = grpc::GoogleDefaultCredentials();
240  auto channel = grpc::CreateChannel("speech.googleapis.com", creds);
241  std::unique_ptr<Speech::Stub> speech(Speech::NewStub(channel));
242 
243  setArguments(request.mutable_config());
244 
245  yInfo() << "getFrequency " << sound.getFrequency();
246  yInfo() << "getSamples " << sound.getSamples();
247  yInfo() << "getChannels " << sound.getChannels();
248  yInfo() << "getBytesPerSamples " << sound.getBytesPerSample();
249 
250  auto vec_i = sound.getNonInterleavedAudioRawData();
251  auto s1 = std::vector<short>(vec_i.begin(), vec_i.end());
252 
253  yInfo() << "AudioRawData s1.size()" << s1.size();
254 
255  request.mutable_audio()->mutable_content()->assign((char*)s1.data(), s1.size()*2);
256 
257  end = std::chrono::system_clock::now();
258 
259  double start_elapsed_seconds = std::chrono::duration_cast<std::chrono::milliseconds> (end-start).count();
260 
261  yInfo() << "From start to mutable audio " << start_elapsed_seconds / 1000 << " seconds passed";
262 
263  grpc::ClientContext context;
264  RecognizeResponse response;
265 
266  start = std::chrono::system_clock::now();
267  if (uniqueSound){
268  checkState("Busy");
269  }
270  yarp::os::Time::delay(0.2);
271  grpc::Status rpc_status = speech->Recognize(&context, request, &response);
272  std::string status_string = status_code_to_string.at(rpc_status.error_code());
273  end = std::chrono::system_clock::now();
274 
275  elapsed_seconds = std::chrono::duration_cast<std::chrono::milliseconds> (end-start).count();
276  yInfo() << "Sending to google took " << elapsed_seconds << " ms";
277 
278  if (!rpc_status.ok()) {
279  // Report the RPC failure.
280  yInfo() << rpc_status.error_message();
281  b.clear();
282  checkState("Failure_" + status_string);
283  }
284  else{
285  yInfo() << "Size of response " << response.results_size();
286  if(response.results_size()>0){
287  checkState("Done");
288 
289  // Dump the transcript of all the results.
290  for (int r = 0; r < response.results_size(); ++r)
291  {
292  auto result = response.results(r);
293  for (int a = 0; a < result.alternatives_size(); ++a)
294  {
295  auto alternative = result.alternatives(a);
296  yInfo() << alternative.confidence();
297  yInfo() << alternative.transcript();
298  b.addString(alternative.transcript());
299  }
300  }
301  }
302  else{
303  checkState("Empty");
304  }
305  }
306  return b;
307  }
308 
309  /********************************************************/
310  bool setLanguageCode(const std::string &languageCode)
311  {
312  language = languageCode;
313  return true;
314  }
315 
316  /********************************************************/
317  std::string getLanguageCode()
318  {
319  return language;
320  }
321 
322  /********************************************************/
323  void setArguments(RecognitionConfig* config)
324  {
325  config->set_language_code(language.c_str());
326  config->set_sample_rate_hertz(sample_rate);
327  config->set_encoding(RecognitionConfig::LINEAR16);
328 
329  config->set_use_enhanced(true); // Can be used with the correct model. If true but no model specified, it does nothing.
330  auto metadata = config->mutable_metadata();
331 
332  metadata->set_microphone_distance(google::cloud::speech::v1::RecognitionMetadata_MicrophoneDistance_MIDFIELD);
333  metadata->set_recording_device_type(google::cloud::speech::v1::RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE);
334  metadata->set_interaction_type(google::cloud::speech::v1::RecognitionMetadata_InteractionType_VOICE_COMMAND);
335  metadata->set_original_media_type(google::cloud::speech::v1::RecognitionMetadata_OriginalMediaType_AUDIO);
336  }
337 
338  /********************************************************/
339  bool start_acquisition()
340  {
341  std::lock_guard<std::mutex> lg(mtx);
342  yarp::os::Bottle cmd, rep;
343  //cmd.addVocab32("start");
344  cmd.addString("start");
345  if (audioCommand.write(cmd, rep))
346  {
347  yDebug() << "cmd.addString(start)" << rep.toString().c_str();
348  }
349 
350  start = std::chrono::system_clock::now();
351  getSounds = true;
352  checkState("Listening");
353  return true;
354  }
355 
356  /********************************************************/
357  bool stop_acquisition()
358  {
359  std::lock_guard<std::mutex> lg(mtx);
360 
361  if (!uniqueSound)
362  getSounds = false;
363 
364  sendForQuery = true;
365  checkState("Busy");
366  return true;
367  }
368  /********************************************************/
369  bool checkState(std::string new_state)
370  {
371  if(new_state!=state){
372  is_changed=true;
373  state=new_state;
374  }
375  else{
376  is_changed=false;
377  }
378  return is_changed;
379  }
380 };
381 
382 /********************************************************/
383 class Module : public yarp::os::RFModule, public googleSpeech_IDL
384 {
385  yarp::os::ResourceFinder *rf;
386  yarp::os::RpcServer rpcPort;
387  std::string state;
388  std::int64_t elapsed_seconds;
389  yarp::os::BufferedPort<yarp::os::Bottle> statePort;
390 
391  Processing *processing;
392  friend class processing;
393 
394  bool closing;
395  bool uniqueSound;
396  std::vector<std::string> allLanguageCodes;
397 
398  /********************************************************/
399  bool attach(yarp::os::RpcServer &source)
400  {
401  return this->yarp().attachAsServer(source);
402  }
403 
404 public:
405 
406  /********************************************************/
407  bool configure(yarp::os::ResourceFinder &rf)
408  {
409  this->rf=&rf;
410  this->state="Ready";
411  this->elapsed_seconds=0;
412  uniqueSound = false;
413 
414  std::string moduleName = rf.check("name", yarp::os::Value("googleSpeech"), "module name (string)").asString();
415  std::string language = rf.check("language_code", yarp::os::Value("en-US"), "language (string)").asString();
416  int sample_rate = rf.check("sample_rate_hertz", yarp::os::Value(16000), "sample rate (int)").asInt32();
417 
418  if (rf.check("uniqueSound", "use a yarp::sig::Sound instead of a microphone"))
419  uniqueSound = true;
420 
421  if (rf.check("languageCodes", "Getting language codes"))
422  {
423  yarp::os::Bottle &grp=rf.findGroup("languageCodes");
424  int sz=grp.size()-1;
425 
426  for (int i=0; i<sz; i++)
427  allLanguageCodes.push_back(grp.get(1+i).asString());
428  }
429 
430  setName(moduleName.c_str());
431 
432  rpcPort.open(("/"+getName("/rpc")).c_str());
433  statePort.open("/"+ moduleName + "/state:o");
434 
435  closing = false;
436 
437  processing = new Processing( moduleName, language, sample_rate, state, elapsed_seconds);
438 
439  /* now start the thread to do the work */
440  processing->open();
441 
442  if (uniqueSound)
443  processing->setUsingUniqueSound();
444 
445  attach(rpcPort);
446 
447  return true;
448  }
449 
450  /********************************************************/
451  bool setLanguage(const std::string& languageCode)
452  {
453  bool returnVal = false;
454 
455  std::string language;
456 
457  for (int i = 0; i < allLanguageCodes.size(); i++)
458  {
459  if (languageCode == allLanguageCodes[i])
460  {
461  language = languageCode;
462  processing->setLanguageCode(languageCode);
463  returnVal = true;
464  break;
465  }
466  }
467 
468  return returnVal;
469  }
470 
471  /********************************************************/
472  std::string getLanguageCode()
473  {
474  return processing->getLanguageCode();
475  }
476 
477 
478  /**********************************************************/
479  bool close()
480  {
481  statePort.close();
482  processing->close();
483  delete processing;
484  return true;
485  }
486 
487  /**********************************************************/
488  bool start()
489  {
490  processing->start_acquisition();
491  return true;
492  }
493 
494  /**********************************************************/
495  bool stop()
496  {
497  processing->stop_acquisition();
498  return true;
499  }
500 
501  /********************************************************/
502  double getPeriod()
503  {
504  return 0.1;
505  }
506 
507  /********************************************************/
508  bool quit()
509  {
510  closing=true;
511  return true;
512  }
513 
514  /********************************************************/
515  bool updateModule()
516  {
517  if(is_changed){
518  is_changed=false;
519  yarp::os::Bottle &outTargets = statePort.prepare();
520  outTargets.clear();
521  outTargets.addString(state);
522  yDebug() << "outTarget:" << outTargets.toString().c_str();
523  statePort.write();
524  }
525  return !closing;
526  }
527  /********************************************************/
528  std::string getState()
529  {
530  return state;
531  }
532 
533  /********************************************************/
534  std::int64_t getProcessingTime()
535  {
536  return elapsed_seconds;
537  }
538 
539 };
540 
541 /********************************************************/
542 int main(int argc, char *argv[])
543 {
544  yarp::os::Network::init();
545 
546  yarp::os::Network yarp;
547  if (!yarp.checkNetwork())
548  {
549  yError("YARP server not available!");
550  return 1;
551  }
552 
553  Module module;
554  yarp::os::ResourceFinder rf;
555 
556  rf.setVerbose( true );
557  rf.setDefaultContext( "googleSpeech" );
558  rf.setDefaultConfigFile( "config.ini" );
559  rf.setDefault("name","googleSpeech");
560  rf.configure(argc,argv);
561 
562  return module.runModule(rf);
563 }