/* ----------------------------------------------------------------- */ /* The HMM-Based Speech Synthesis Engine "hts_engine API" */ /* developed by HTS Working Group */ /* http://hts-engine.sourceforge.net/ */ /* ----------------------------------------------------------------- */ /* */ /* Copyright (c) 2001-2013 Nagoya Institute of Technology */ /* Department of Computer Science */ /* */ /* 2001-2008 Tokyo Institute of Technology */ /* Interdisciplinary Graduate School of */ /* Science and Engineering */ /* */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* - Redistributions of source code must retain the above copyright */ /* notice, this list of conditions and the following disclaimer. */ /* - Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials provided */ /* with the distribution. */ /* - Neither the name of the HTS working group nor the names of its */ /* contributors may be used to endorse or promote products derived */ /* from this software without specific prior written permission. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */ /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */ /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */ /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */ /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */ /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */ /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */ /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */ /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* ----------------------------------------------------------------- */ #ifndef HTS_ENGINE_C #define HTS_ENGINE_C #ifdef __cplusplus #define HTS_ENGINE_C_START extern "C" { #define HTS_ENGINE_C_END } #else #define HTS_ENGINE_C_START #define HTS_ENGINE_C_END #endif /* __CPLUSPLUS */ HTS_ENGINE_C_START; #include #include "HTS_engine.h" /* usage: output usage */ void usage(void) { fprintf(stderr, "%s\n", HTS_COPYRIGHT); fprintf(stderr, "hts_engine - The HMM-based speech synthesis engine \"hts_engine API\"\n"); fprintf(stderr, "\n"); fprintf(stderr, " usage:\n"); fprintf(stderr, " hts_engine [ options ] [ infile ]\n"); fprintf(stderr, " options: [ def][ min-- max]\n"); fprintf(stderr, " -m htsvoice : HTS voice files [ N/A]\n"); fprintf(stderr, " -od s : filename of output label with duration [ N/A]\n"); fprintf(stderr, " -om s : filename of output spectrum [ N/A]\n"); fprintf(stderr, " -of s : filename of output log F0 [ N/A]\n"); fprintf(stderr, " -ol s : filename of output low-pass filter [ N/A]\n"); fprintf(stderr, " -or s : filename of output raw audio (generated speech) [ N/A]\n"); fprintf(stderr, " -ow s : filename of output wav audio (generated speech) [ N/A]\n"); fprintf(stderr, " -ot s : filename of output trace information [ N/A]\n"); fprintf(stderr, " -vp : use phoneme alignment for duration [ N/A]\n"); fprintf(stderr, " -i i f1 .. fi : enable interpolation & specify number(i),coefficient(f) [ N/A]\n"); fprintf(stderr, " -s i : sampling frequency [ auto][ 1-- ]\n"); fprintf(stderr, " -p i : frame period (point) [ auto][ 1-- ]\n"); fprintf(stderr, " -a f : all-pass constant [ auto][ 0.0-- 1.0]\n"); fprintf(stderr, " -b f : postfiltering coefficient [ 0.0][ 0.0-- 1.0]\n"); fprintf(stderr, " -r f : speech speed rate [ 1.0][ 0.0-- ]\n"); fprintf(stderr, " -fm f : additional half-tone [ 0.0][ -- ]\n"); fprintf(stderr, " -u f : voiced/unvoiced threshold [ 0.5][ 0.0-- 1.0]\n"); fprintf(stderr, " -jm f : weight of GV for spectrum [ 1.0][ 0.0-- ]\n"); fprintf(stderr, " -jf f : weight of GV for log F0 [ 1.0][ 0.0-- ]\n"); fprintf(stderr, " -z i : audio buffer size (if i==0, turn off) [ 0][ 0-- ]\n"); fprintf(stderr, " infile:\n"); fprintf(stderr, " label file\n"); fprintf(stderr, " note:\n"); fprintf(stderr, " generated spectrum, log F0, and low-pass filter coefficient\n"); fprintf(stderr, " sequences are saved in natural endian, binary (float) format.\n"); fprintf(stderr, "\n"); exit(0); } int main(int argc, char **argv) { int i; double f; /* hts_engine API */ HTS_Engine engine; /* HTS voices */ size_t num_voices; char **fn_voices; /* input label file name */ char *labfn = NULL; /* output file pointers */ FILE *durfp = NULL, *mgcfp = NULL, *lf0fp = NULL, *lpffp = NULL, *wavfp = NULL, *rawfp = NULL, *tracefp = NULL; /* interpolation weights */ size_t num_interpolation_weights; /* output usage */ if (argc <= 1) usage(); /* initialize hts_engine API */ HTS_Engine_initialize(&engine); /* get HTS voice file names */ num_voices = 0; fn_voices = (char **) malloc(argc * sizeof(char *)); for (i = 0; i < argc; i++) { if (argv[i][0] == '-' && argv[i][1] == 'm') fn_voices[num_voices++] = argv[++i]; if (argv[i][0] == '-' && argv[i][1] == 'h') usage(); } if (num_voices == 0) { fprintf(stderr, "Error: HTS voice must be specified.\n"); free(fn_voices); exit(1); } /* load HTS voices */ if (HTS_Engine_load(&engine, fn_voices, num_voices) != TRUE) { fprintf(stderr, "Error: HTS voices cannot be loaded.\n"); free(fn_voices); HTS_Engine_clear(&engine); exit(1); } free(fn_voices); /* get options */ while (--argc) { if (**++argv == '-') { switch (*(*argv + 1)) { case 'v': switch (*(*argv + 2)) { case 'p': HTS_Engine_set_phoneme_alignment_flag(&engine, TRUE); break; default: fprintf(stderr, "Error: Invalid option '-v%c'.\n", *(*argv + 2)); HTS_Engine_clear(&engine); exit(1); } break; case 'o': switch (*(*argv + 2)) { case 'w': wavfp = fopen(*++argv, "wb"); break; case 'r': rawfp = fopen(*++argv, "wb"); break; case 'd': durfp = fopen(*++argv, "wt"); break; case 'm': mgcfp = fopen(*++argv, "wb"); break; case 'f': case 'p': lf0fp = fopen(*++argv, "wb"); break; case 'l': lpffp = fopen(*++argv, "wb"); break; case 't': tracefp = fopen(*++argv, "wt"); break; default: fprintf(stderr, "Error: Invalid option '-o%c'.\n", *(*argv + 2)); HTS_Engine_clear(&engine); exit(1); } --argc; break; case 'h': usage(); break; case 'm': argv++; /* HTS voices were already loaded */ --argc; break; case 's': HTS_Engine_set_sampling_frequency(&engine, (size_t) atoi(*++argv)); --argc; break; case 'p': HTS_Engine_set_fperiod(&engine, (size_t) atoi(*++argv)); --argc; break; case 'a': HTS_Engine_set_alpha(&engine, atof(*++argv)); --argc; break; case 'b': HTS_Engine_set_beta(&engine, atof(*++argv)); --argc; break; case 'r': HTS_Engine_set_speed(&engine, atof(*++argv)); --argc; break; case 'f': switch (*(*argv + 2)) { case 'm': HTS_Engine_add_half_tone(&engine, atof(*++argv)); break; default: fprintf(stderr, "Error: Invalid option '-f%c'.\n", *(*argv + 2)); HTS_Engine_clear(&engine); exit(1); } --argc; break; case 'u': HTS_Engine_set_msd_threshold(&engine, 1, atof(*++argv)); --argc; break; case 'i': num_interpolation_weights = atoi(*++argv); argc--; if (num_interpolation_weights != num_voices) { HTS_Engine_clear(&engine); exit(1); } for (i = 0; i < num_interpolation_weights; i++) { f = atof(*++argv); argc--; HTS_Engine_set_duration_interpolation_weight(&engine, i, f); HTS_Engine_set_parameter_interpolation_weight(&engine, i, 0, f); HTS_Engine_set_parameter_interpolation_weight(&engine, i, 1, f); HTS_Engine_set_gv_interpolation_weight(&engine, i, 0, f); HTS_Engine_set_gv_interpolation_weight(&engine, i, 1, f); } break; case 'j': switch (*(*argv + 2)) { case 'm': HTS_Engine_set_gv_weight(&engine, 0, atof(*++argv)); break; case 'f': case 'p': HTS_Engine_set_gv_weight(&engine, 1, atof(*++argv)); break; default: fprintf(stderr, "Error: Invalid option '-j%c'.\n", *(*argv + 2)); HTS_Engine_clear(&engine); exit(1); } --argc; break; default: fprintf(stderr, "Error: Invalid option '-%c'.\n", *(*argv + 1)); HTS_Engine_clear(&engine); exit(1); } } else { labfn = *argv; } } /* synthesize */ if (HTS_Engine_synthesize_from_fn(&engine, labfn) != TRUE) { fprintf(stderr, "Error: waveform cannot be synthesized.\n"); HTS_Engine_clear(&engine); exit(1); } /* output */ if (tracefp != NULL) HTS_Engine_save_information(&engine, tracefp); if (durfp != NULL) HTS_Engine_save_label(&engine, durfp); if (rawfp) HTS_Engine_save_generated_speech(&engine, rawfp); if (wavfp) HTS_Engine_save_riff(&engine, wavfp); if (mgcfp) HTS_Engine_save_generated_parameter(&engine, 0, mgcfp); if (lf0fp) HTS_Engine_save_generated_parameter(&engine, 1, lf0fp); if (lpffp) HTS_Engine_save_generated_parameter(&engine, 2, lpffp); /* reset */ HTS_Engine_refresh(&engine); /* free memory */ HTS_Engine_clear(&engine); /* close files */ if (durfp != NULL) fclose(durfp); if (mgcfp != NULL) fclose(mgcfp); if (lf0fp != NULL) fclose(lf0fp); if (lpffp != NULL) fclose(lpffp); if (wavfp != NULL) fclose(wavfp); if (rawfp != NULL) fclose(rawfp); if (tracefp != NULL) fclose(tracefp); return 0; } HTS_ENGINE_C_END; #endif /* !HTS_ENGINE_C */