// speech2text7.ck
// copyright 2007 Les Hall
// This software released under the GNU General Protective License
// attempts to convert voice to phonemes with a neural net
//
// Program flow (one pass of the time loop per FFT frame):
//   1. learning : prompt for each phoneme in turn and store one input
//                 vector per phoneme per session in Oia
//   2. training : run backpropagation over the stored vectors
//   3. running  : classify each frame that is above the noise threshold

// control variables
4*1024 => int num_samples;  // number of FFT samples
num_samples / 2 => int num_freq;  // number of FFT frequencies
0.010 => float noise_threshold;  // below this is noise
0.500 => float peak_threshold;  // you must be this tall to be a peak
0.20 => float lc;  // learning coefficient of neural net
5 => int num_phonemes;  // number of phonemes in dictionary
1 => int sessions;  // number of learning and training sessions
0.003 => float error_threshold;  // error must be below this to end training
300000 / (sessions * num_phonemes) => int training_iterations;  // number of times to train net

// phoneme dictionary: entry k is the label for output neuron k, paired with
// the example word shown to the user while learning.
// Only the first num_phonemes entries are used.
[ "A", "E", "I", "O", "U",
  "AE", "EE", "IE", "OE", "UE",
  "OO", "AR", "UR", "OR", "AU",
  "ER", "OW", "OI", "AIR", "EAR",
  "B", "D", "F", "G", "H",
  "J", "K", "L", "M", "N",
  "P", "R", "S", "T", "V",
  "W", "WH", "Y", "Z", "TH",
  "TH", "CH", "SH", "ZH", "NG" ] @=> string phoneme_names[];
[ "cAt", "pEg", "pIg", "lOg", "plUg",
  "pAIn", "swEEt", "trIEd", "rOAd", "mOOn",
  "lOOk", "cARt", "bURn", "tORn", "hAUl",
  "sistER", "dOWn", "cOIn", "stAIRs", "fEAR",
  "Baby", "Dog", "Field", "Game", "Hat",
  "JuDGE", "CooK", "Lamb", "Monkey", "Nut",
  "Paper", "Rabbit", "Sun", "Tap", "Van",
  "Was", "WHere", "Yes", "Zebra", "THen",
  "THin", "CHip", "SHip", "treaSure", "riNG" ] @=> string phoneme_examples[];

// characters used to print each output neuron's activation as one digit
[ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" ] @=> string digits[];

// the FFT patch
adc => FFT fft =^ Centroid cent => blackhole;
num_samples => fft.size;
Windowing.hamming(num_samples) => fft.window;
Step step_source => dac;

// declare variables
int peaks[num_freq];  // holds the peak bin indices; only the first num_peaks entries are valid
float fft_norm[num_freq];  // stores the normalized fft magnitudes
int peak;  // bin index of the largest magnitude
float max_peak;  // holds the maximum peak value
int num_peaks;  // stores the number of peaks
string phoneme;  // stores the recognized phoneme
string example;  // example phoneme
float centroid;  // centroid in frequency
0 => float step_val;  // step value
3 => float step_dur;  // step duration in ms

// declare neural net variables
7 => int num_input_peaks;  // number of peaks to use
2*num_input_peaks+2 => int num_inputs;  // number of inputs to neural net
num_inputs => int num_hidden;  // number of hidden neurons
num_phonemes => int num_output;  // number of output neurons
float Oi[num_inputs];  // input layer outputs
float Oia[sessions*num_phonemes][num_inputs];  // input layer outputs for all phonemes
float Wh1[num_inputs+1][num_hidden];  // hidden layer 1 weights (last row holds the offsets)
float Sh1[num_hidden];  // hidden layer 1 sums
float Eh1[num_hidden];  // hidden layer 1 errors
float Oh1[num_hidden];  // hidden layer 1 outputs
float Wh2[num_hidden+1][num_hidden];  // hidden layer 2 weights (last row holds the offsets)
float Sh2[num_hidden];  // hidden layer 2 sums
float Eh2[num_hidden];  // hidden layer 2 errors
float Oh2[num_hidden];  // hidden layer 2 outputs
float Wo[num_hidden+1][num_output];  // output layer weights (last row holds the offsets)
float So[num_output];  // output layer sums
float Eo[num_output];  // output layer errors
float Oo[num_output];  // output layer outputs
-1 => int trainchar;  // training character (-1 until the first prompt is shown)
0 => int learning;  // learning counter; written below but not read anywhere in this file
0 => int training;  // 1 if training, 0 if not training
0 => int running;  // 1 after training
float y;  // desired output of a given output layer neuron
float g;  // weighted error sum for backprop to hidden layer
float max_output;  // maximum output
int phoneme_index;  // index of the phoneme
-1 => int sc;  // session counter (-1 until the first prompt is shown)
float error;  // sum of output error
string out_string;  // string containing output values

// seed the neural network
// seed hidden layer 1 weights
for (0 => int i; i < num_inputs+1; i++)
{
    for (0 => int j; j < num_hidden; j++)
    {
        Std.rand2f(-1.0, 1.0) => Wh1[i][j];
    }
}
// seed hidden layer 2 weights
for (0 => int i; i < num_hidden+1; i++)
{
    for (0 => int j; j < num_hidden; j++)
    {
        Std.rand2f(-1.0, 1.0) => Wh2[i][j];
    }
}
// seed output layer weights
for (0 => int i; i < num_hidden+1; i++)
{
    for (0 => int j; j < num_output; j++)
    {
        Std.rand2f(-1.0, 1.0) => Wo[i][j];
    }
}

// logistic activation function
fun float sigmoid(float x)
{
    return 1.0 / (1.0 + Math.exp(-x));
}

// advances time by step_dur ms, then sends the given neuron output to the
// Step generator that is patched to the dac
fun void sonify(float value)
{
    step_dur * 1::ms => now;
    value => step_val;
    step_val => step_source.next;
}

// maps an output activation to the single character used in the printout:
// "0".."9" for the tenths digit, "." if the value is not >= 0
fun string output_digit(float value)
{
    if (value >= 0.0)
    {
        (value * 10.0) $ int => int d;
        if (d > 9) 9 => d;
        return digits[d];
    }
    return ".";
}

// fills the input layer Oi from the current frame's analysis results
// (peaks, num_peaks, fft_norm, centroid)
fun void load_input_layer()
{
    // first, put the peak frequencies (relative to the centroid) into the input layer
    for (0 => int i; i < num_input_peaks; i++)
    {
        // unused slots are zero; a zero centroid would divide by zero
        if (i >= num_peaks || centroid == 0.0)
        {
            0.0 => Oi[i];
        }
        else
        {
            (((peaks[i] $ float) / (num_freq $ float)) * (second/samp) / 2) / centroid => Oi[i];
        }
    }
    // second, put the normalized peak values into the input layer
    for (0 => int i; i < num_input_peaks; i++)
    {
        if (i >= num_peaks)
        {
            0.0 => Oi[num_input_peaks+i];
        }
        else
        {
            fft_norm[peaks[i]] => Oi[num_input_peaks+i];
        }
    }
    // third, put the centroid into the input layer
    centroid / 4000 => Oi[2*num_input_peaks];
    // fourth, put the normalized number of peaks into the input layer
    (num_peaks $ float) / num_input_peaks => Oi[2*num_input_peaks+1];
}

// forward propagates Oi through both hidden layers to the output layer Oo;
// every neuron output is also sent to sonify(), so this call advances time
fun void forward_propagate()
{
    // calculate the hidden layer 1 sums
    for (0 => int j; j < num_hidden; j++)
    {
        0.0 => Sh1[j];  // initialize the sum to zero
        for (0 => int i; i < num_inputs; i++)
        {
            Wh1[i][j] * Oi[i] +=> Sh1[j];  // add all the weight * input terms
        }
        Wh1[num_inputs][j] +=> Sh1[j];  // add in the offsets
    }
    // next calculate the sigmoid output function for the hidden layer 1
    for (0 => int j; j < num_hidden; j++)
    {
        sigmoid(Sh1[j]) => Oh1[j];
        sonify(Oh1[j]);
    }

    // calculate the hidden layer 2 sums
    for (0 => int j; j < num_hidden; j++)
    {
        0.0 => Sh2[j];  // initialize the sum to zero
        for (0 => int i; i < num_hidden; i++)
        {
            Wh2[i][j] * Oh1[i] +=> Sh2[j];  // add all the weight * input terms
        }
        Wh2[num_hidden][j] +=> Sh2[j];  // add in the offsets
    }
    // next calculate the sigmoid output function for the hidden layer 2
    for (0 => int j; j < num_hidden; j++)
    {
        sigmoid(Sh2[j]) => Oh2[j];
        sonify(Oh2[j]);
    }

    // then calculate the output layer sums
    for (0 => int j; j < num_output; j++)
    {
        0.0 => So[j];  // initialize the sum to zero
        for (0 => int i; i < num_hidden; i++)
        {
            Wo[i][j] * Oh2[i] +=> So[j];  // add all the weight * input terms
        }
        Wo[num_hidden][j] +=> So[j];  // add in the offsets
    }
    // next calculate the sigmoid output function for the output layer
    for (0 => int j; j < num_output; j++)
    {
        sigmoid(So[j]) => Oo[j];
        sonify(Oo[j]);
    }
}

// one backpropagation step for the pattern just forward propagated;
// target is the index of the output neuron that should be 1.
// Adds the squared output errors to the global error sum.
fun void backpropagate(int target)
{
    // calculate the error functions of the output neurons
    for (0 => int j; j < num_output; j++)
    {
        if (j == target)
        {
            1.0 => y;
        }
        else
        {
            0.0 => y;
        }
        Oo[j] * (1 - Oo[j]) * (y - Oo[j]) => Eo[j];
        // add up the error terms
        Eo[j] * Eo[j] +=> error;
    }

    // All layer errors are computed before any weight is changed, so the
    // error is propagated back through the same weights that produced the
    // forward pass.

    // calculate the error at each hidden layer 2 neuron
    for (0 => int i; i < num_hidden; i++)
    {
        0.0 => g;
        for (0 => int j; j < num_output; j++)
        {
            Wo[i][j] * Eo[j] +=> g;
        }
        Oh2[i] * (1 - Oh2[i]) * g => Eh2[i];
    }
    // calculate the error at each hidden layer 1 neuron
    for (0 => int i; i < num_hidden; i++)
    {
        0.0 => g;
        for (0 => int j; j < num_hidden; j++)
        {
            Wh2[i][j] * Eh2[j] +=> g;
        }
        Oh1[i] * (1 - Oh1[i]) * g => Eh1[i];
    }

    // update the weights and offsets of the output neurons
    for (0 => int j; j < num_output; j++)
    {
        for (0 => int i; i < num_hidden; i++)
        {
            lc * Eo[j] * Oh2[i] +=> Wo[i][j];
        }
        lc * Eo[j] +=> Wo[num_hidden][j];
    }
    // update the weights and offsets of the hidden layer 2 neurons
    for (0 => int j; j < num_hidden; j++)
    {
        for (0 => int i; i < num_hidden; i++)
        {
            lc * Eh2[j] * Oh1[i] +=> Wh2[i][j];
        }
        lc * Eh2[j] +=> Wh2[num_hidden][j];
    }
    // update the weights and offsets of the hidden layer 1 neurons
    for (0 => int j; j < num_hidden; j++)
    {
        for (0 => int i; i < num_inputs; i++)
        {
            lc * Eh1[j] * Oi[i] +=> Wh1[i][j];
        }
        lc * Eh1[j] +=> Wh1[num_inputs][j];
    }
}

// time loop
while (true)
{
    // compute the FFT
    fft.upchuck();

    // compute the centroid, scaled by half the sample rate
    cent.upchuck();
    cent.fval(0) * (second/samp) / 2 => centroid;

    // find peak value; start from bin 0 so max_peak is the true maximum
    0 => peak;
    fft.fval(0) => max_peak;
    for (1 => int i; i < num_freq; i++)
    {
        if (fft.fval(i) > max_peak)
        {
            fft.fval(i) => max_peak;
            i => peak;
        }
    }

    // normalize so that biggest frequency is one
    // (a silent frame is zeroed so no values survive from the previous frame)
    for (0 => int i; i < num_freq; i++)
    {
        if (max_peak != 0)
        {
            fft.fval(i) / max_peak => fft_norm[i];
        }
        else
        {
            0.0 => fft_norm[i];
        }
    }

    // find all peaks above threshold
    0 => num_peaks;
    for (1 => int i; i < num_freq - 1; i++)
    {
        //if ((fft_norm[i] > peak_threshold) && (fft_norm[i-1] < fft_norm[i]) && (fft_norm[i] > fft_norm[i+1])) {
        if (fft_norm[i] > peak_threshold)
        {
            i => peaks[num_peaks];
            num_peaks++;
        }
    }

    // 1 when this frame is louder than the noise threshold
    (max_peak > noise_threshold) => int voiced;

    // learn the vocabulary
    if (training == 0 && running == 0 && sc < sessions)
    {
        // save input set; nothing is saved before the first prompt, when
        // sc and trainchar are still -1 and would index outside Oia
        if (voiced && sc >= 0 && trainchar >= 0)
        {
            // set up the input layer
            load_input_layer();
            // save the inputs for training
            for (0 => int i; i < num_inputs; i++)
            {
                Oi[i] => Oia[sc*num_phonemes+trainchar][i];
            }
            (learning + 1) % num_phonemes => learning;
            // advance time
            1::second => now;
        }

        // move to the next phoneme and tell the user what it is
        if (trainchar < 0 || voiced)
        {
            if (trainchar < 0)
            {
                <<<"learning vocabulary...", " ">>>;
            }
            (trainchar + 1) % num_phonemes => trainchar;
            if (trainchar < phoneme_names.size())
            {
                phoneme_names[trainchar] => phoneme;
                phoneme_examples[trainchar] => example;
            }
            // wrapping back to the first phoneme starts the next session
            if (trainchar == 0)
            {
                sc++;
                if (sc == sessions)
                {
                    1 => training;
                }
            }
            if (sc < sessions)
            {
                <<<"session", sc+1, "/", sessions, " say phoneme: ", phoneme, " example: ", example, " ">>>;
            }
        }
    }

    // train the neural network
    if (training == 1)
    {
        <<<"training...", " ">>>;
        for (0 => int l; l < training_iterations; l++)
        {
            0.0 => error;
            // loop on each stored phoneme of each session
            for (0 => sc; sc < sessions; sc++)
            {
                for (0 => trainchar; trainchar < num_phonemes; trainchar++)
                {
                    // apply the inputs
                    for (0 => int i; i < num_inputs; i++)
                    {
                        Oia[sc*num_phonemes+trainchar][i] => Oi[i];
                    }
                    // forward propagate the neural net
                    forward_propagate();
                    // backpropagate the neural net
                    backpropagate(trainchar);
                }
            }
            // stop early once the summed error of a full pass is small enough
            if (error < error_threshold)
            {
                break;
            }
        }
        // reset the training flag
        0 => training;
        // reset trainchar
        -1 => trainchar;
        // set the running flag
        1 => running;
        <<<"training complete with error =", 100*error, "%", " ">>>;
        <<<"running...", "say any phonemes">>>;
    }

    // recognize phonemes
    if (running == 1)
    {
        if (voiced)
        {
            // set up the input layer and propagate the neural net
            load_input_layer();
            forward_propagate();

            // find the strongest output
            0.0 => max_output;
            -1 => phoneme_index;
            for (0 => int i; i < num_output; i++)
            {
                if (Oo[i] > max_output)
                {
                    Oo[i] => max_output;
                    i => phoneme_index;
                }
            }

            // one character per output neuron showing its activation
            "" => out_string;
            for (0 => int i; i < num_output; i++)
            {
                out_string + output_digit(Oo[i]) => out_string;
            }

            // look up the label of the winning output neuron
            if (phoneme_index == -1)
            {
                " " => phoneme;
            }
            else if (phoneme_index < phoneme_names.size())
            {
                phoneme_names[phoneme_index] => phoneme;
            }

            // NOTE(review): the original print statement was unreadable
            // ("<<>>;"); printing the activations and the recognized phoneme
            // is a reconstruction - confirm against the original file
            <<<out_string, phoneme>>>;
        }
    }

    // advance time
    num_samples::samp => now;
}