2014-03-10 09:32:03 +00:00
/* ----------------------------------------------------------------- */
/* The HMM-Based Speech Synthesis Engine "hts_engine API" */
/* developed by HTS Working Group */
/* http://hts-engine.sourceforge.net/ */
/* ----------------------------------------------------------------- */
/* */
2015-12-30 17:05:25 -06:00
/* Copyright (c) 2001-2015 Nagoya Institute of Technology */
2014-03-10 09:32:03 +00:00
/* Department of Computer Science */
/* */
/* 2001-2008 Tokyo Institute of Technology */
/* Interdisciplinary Graduate School of */
/* Science and Engineering */
/* */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* - Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* - Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials provided */
/* with the distribution. */
/* - Neither the name of the HTS working group nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
/* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
/* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
/* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
/* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* ----------------------------------------------------------------- */
# ifndef HTS_ENGINE_H
# define HTS_ENGINE_H
/*
 * HTS_ENGINE_H_START / HTS_ENGINE_H_END bracket the declarations of this
 * header.  Under a C++ compiler they open and close an extern "C" linkage
 * block; under a C compiler they expand to nothing.
 */
#ifdef __cplusplus
#define HTS_ENGINE_H_START extern "C" {
#define HTS_ENGINE_H_END   }
#else
#define HTS_ENGINE_H_START
#define HTS_ENGINE_H_END
#endif                          /* __cplusplus */

/*
 * <stdio.h> supplies FILE and size_t for the declarations below.  It is
 * included before the linkage block is opened so that a C++ compiler never
 * sees a standard header nested inside extern "C" { ... }.
 */
#include <stdio.h>

HTS_ENGINE_H_START;
/* common ---------------------------------------------------------- */
/* HTS_Boolean: character-sized type used for flags throughout this API */
typedef char HTS_Boolean;

/* TRUE / FALSE: defined here only if no earlier definition exists */
#if !defined(TRUE)
#define TRUE 1
#endif                          /* !TRUE */

#if !defined(FALSE)
#define FALSE 0
#endif                          /* !FALSE */

/* HTS_NODATA: large negative constant; presumably a marker for a missing
   value -- confirm against the code that uses it */
#if !defined(HTS_NODATA)
#define HTS_NODATA (-1.0e+10)
#endif                          /* !HTS_NODATA */
/* copyright ------------------------------------------------------- */
2015-12-30 17:05:25 -06:00
/* HTS_COPYRIGHT: banner text holding the engine name, version, project URL
   and copyright holders.  The adjacent literals are concatenated by the
   compiler into a single string. */
#define HTS_COPYRIGHT \
   "The HMM-Based Speech Synthesis Engine \"hts_engine API\"\n" \
   "Version 1.10 (http://hts-engine.sourceforge.net/)\n" \
   "Copyright (C) 2001-2015 Nagoya Institute of Technology\n" \
   " 2001-2008 Tokyo Institute of Technology\n" \
   "All rights reserved.\n"
2014-03-10 09:32:03 +00:00
/* audio ----------------------------------------------------------- */
/* HTS_Audio: wrapper around the audio output */
typedef struct _HTS_Audio {
   size_t sampling_frequency;   /* sampling frequency */
   size_t max_buff_size;        /* buffer size of the audio output interface */
   short *buff;                 /* buffer currently in use */
   size_t buff_size;            /* size of the buffer currently in use */
   void *audio_interface;       /* audio interface chosen at compile time */
} HTS_Audio;
/* model ----------------------------------------------------------- */
/* HTS_Window: window coefficients used to calculate dynamic features */
typedef struct _HTS_Window {
   size_t size;                 /* number of windows (static + deltas) */
   int *l_width;                /* left widths of the windows */
   int *r_width;                /* right widths of the windows */
   double **coefficient;        /* window coefficients */
   size_t max_width;            /* largest width among the windows */
} HTS_Window;
/* HTS_Pattern: singly linked list of the patterns of a question or a tree */
typedef struct _HTS_Pattern {
   char *string;                /* pattern string */
   struct _HTS_Pattern *next;   /* next pattern in the list */
} HTS_Pattern;
/* HTS_Question: singly linked list of the questions of a tree */
typedef struct _HTS_Question {
   char *string;                /* question name */
   HTS_Pattern *head;           /* first entry of this question's pattern list */
   struct _HTS_Question *next;  /* next question in the list */
} HTS_Question;
/* HTS_Node: node of a decision tree; nodes are also chained into a list */
typedef struct _HTS_Node {
   int index;                   /* index of this node */
   size_t pdf;                  /* PDF index for this node (leaf nodes only) */
   struct _HTS_Node *yes;       /* child node on the "yes" branch */
   struct _HTS_Node *no;        /* child node on the "no" branch */
   struct _HTS_Node *next;      /* next node in the list */
   HTS_Question *quest;         /* question asked at this node */
} HTS_Node;
/* HTS_Tree: singly linked list of the decision trees of a model */
typedef struct _HTS_Tree {
   HTS_Pattern *head;           /* first entry of this tree's pattern list */
   struct _HTS_Tree *next;      /* next tree in the list */
   HTS_Node *root;              /* root node of this tree */
   size_t state;                /* state index this tree belongs to */
} HTS_Tree;
/* HTS_Model: a set of PDFs together with decision trees and questions */
typedef struct _HTS_Model {
   size_t vector_length;        /* vector length (static features only) */
   size_t num_windows;          /* number of windows for delta */
   HTS_Boolean is_msd;          /* MSD flag */
   size_t ntree;                /* number of trees */
   size_t *npdf;                /* number of PDFs at each tree */
   float ***pdf;                /* PDFs */
   HTS_Tree *tree;              /* list of trees */
   HTS_Question *question;      /* list of questions */
} HTS_Model;
/* HTS_ModelSet: duration models, HMMs and GV models grouped together */
typedef struct _HTS_ModelSet {
   char *hts_voice_version;     /* HTS voice format version */
   size_t sampling_frequency;   /* sampling frequency */
   size_t frame_period;         /* frame period */
   size_t num_voices;           /* number of HTS voices */
   size_t num_states;           /* number of HMM states */
   size_t num_streams;          /* number of streams */
   char *stream_type;           /* stream type */
   char *fullcontext_format;    /* format of the fullcontext label */
   char *fullcontext_version;   /* version of the fullcontext label */
   HTS_Question *gv_off_context;        /* GV switch */
   char **option;               /* options of each stream */
   HTS_Model *duration;         /* PDFs and trees for duration */
   HTS_Window *window;          /* window coefficients for delta */
   HTS_Model **stream;          /* PDFs and trees for parameters */
   HTS_Model **gv;              /* PDFs and trees for GV */
} HTS_ModelSet;
/* label ----------------------------------------------------------- */
/* HTS_LabelString: one label string plus its time information */
typedef struct _HTS_LabelString {
   struct _HTS_LabelString *next;       /* next label string in the list */
   char *name;                  /* the label string itself */
   double start;                /* start frame given in the label */
   double end;                  /* end frame given in the label */
} HTS_LabelString;
/* HTS_Label: list of label strings */
typedef struct _HTS_Label {
   HTS_LabelString *head;       /* first label string of the list */
   size_t size;                 /* number of label strings */
} HTS_Label;
/* sstream --------------------------------------------------------- */
/* HTS_SStream: a single state stream */
typedef struct _HTS_SStream {
   size_t vector_length;        /* vector length (static features only) */
   double **mean;               /* sequence of mean vectors */
   double **vari;               /* sequence of variance vectors */
   double *msd;                 /* sequence of MSD parameters */
   size_t win_size;             /* number of windows (static + deltas) */
   int *win_l_width;            /* left widths of the windows */
   int *win_r_width;            /* right widths of the windows */
   double **win_coefficient;    /* window coefficients */
   size_t win_max_width;        /* largest width among the windows */
   double *gv_mean;             /* GV mean vector */
   double *gv_vari;             /* GV variance vector */
   HTS_Boolean *gv_switch;      /* sequence of GV flags */
} HTS_SStream;
/* HTS_SStreamSet: collection of state streams */
typedef struct _HTS_SStreamSet {
   HTS_SStream *sstream;        /* the state streams */
   size_t nstream;              /* number of streams */
   size_t nstate;               /* number of states */
   size_t *duration;            /* sequence of durations */
   size_t total_state;          /* total state */
   size_t total_frame;          /* total frame */
} HTS_SStreamSet;
/* pstream --------------------------------------------------------- */
/* HTS_SMatrices: matrices and vectors needed by the speech parameter
   generation algorithm */
typedef struct _HTS_SMatrices {
   double **mean;               /* sequence of mean vectors */
   double **ivar;               /* sequence of inverse diagonal variances */
   double *g;                   /* vector used by the forward substitution */
   double **wuw;                /* W' U^-1 W */
   double *wum;                 /* W' U^-1 mu */
} HTS_SMatrices;
/* HTS_PStream: a single PDF stream */
typedef struct _HTS_PStream {
   size_t vector_length;        /* vector length (static features only) */
   size_t length;               /* length of the stream */
   size_t width;                /* width of the dynamic window */
   double **par;                /* output parameter vectors */
   HTS_SMatrices sm;            /* matrices used for parameter generation */
   size_t win_size;             /* number of windows (static + deltas) */
   int *win_l_width;            /* left widths of the windows */
   int *win_r_width;            /* right widths of the windows */
   double **win_coefficient;    /* window coefficients */
   HTS_Boolean *msd_flag;       /* sequence of Boolean values for MSD */
   double *gv_mean;             /* GV mean vector */
   double *gv_vari;             /* GV variance vector */
   HTS_Boolean *gv_switch;      /* sequence of GV flags */
   size_t gv_length;            /* frame length used for GV calculation */
} HTS_PStream;
/* HTS_PStreamSet: collection of PDF streams */
typedef struct _HTS_PStreamSet {
   HTS_PStream *pstream;        /* the PDF streams */
   size_t nstream;              /* number of PDF streams */
   size_t total_frame;          /* total frame */
} HTS_PStreamSet;
/* gstream --------------------------------------------------------- */
/* HTS_GStream: one stream of generated parameters */
typedef struct _HTS_GStream {
   size_t vector_length;        /* vector length (static features only) */
   double **par;                /* the generated parameters */
} HTS_GStream;
/* HTS_GStreamSet: collection of generated parameter streams */
typedef struct _HTS_GStreamSet {
   size_t total_nsample;        /* total sample */
   size_t total_frame;          /* total frame */
   size_t nstream;              /* number of streams */
   HTS_GStream *gstream;        /* the generated parameter streams */
   double *gspeech;             /* the generated speech */
} HTS_GStreamSet;
/* engine ---------------------------------------------------------- */
/* HTS_Condition: the conditions under which synthesis runs */
typedef struct _HTS_Condition {
   /* global settings */
   size_t sampling_frequency;   /* sampling frequency */
   size_t fperiod;              /* frame period */
   size_t audio_buff_size;      /* size of the audio buffer (for the audio device) */
   HTS_Boolean stop;            /* stop flag */
   double volume;               /* volume */
   double *msd_threshold;       /* MSD thresholds */
   double *gv_weight;           /* GV weights */

   /* duration settings */
   HTS_Boolean phoneme_alignment_flag;  /* whether the phoneme alignment in the label is used */
   double speed;                /* speech speed */

   /* spectrum settings */
   size_t stage;                /* gamma = 0 when stage is 0, otherwise gamma = -1/stage */
   HTS_Boolean use_log_gain;    /* log gain flag (for LSP) */
   double alpha;                /* all-pass constant */
   double beta;                 /* postfiltering coefficient */

   /* log F0 settings */
   double additional_half_tone; /* additional half tone */

   /* interpolation weights */
   double *duration_iw;         /* duration interpolation weights */
   double **parameter_iw;       /* parameter interpolation weights */
   double **gv_iw;              /* GV interpolation weights */
} HTS_Condition;
/* HTS_Engine: the engine itself; every member below is held by value */
typedef struct _HTS_Engine {
   HTS_Condition condition;     /* synthesis condition */
   HTS_Audio audio;             /* audio output */
   HTS_ModelSet ms;             /* duration models, HMMs and GV models */
   HTS_Label label;             /* label */
   HTS_SStreamSet sss;          /* state streams */
   HTS_PStreamSet pss;          /* PDF streams */
   HTS_GStreamSet gss;          /* generated parameter streams */
} HTS_Engine;
/* engine method --------------------------------------------------- */
/* HTS_Engine_initialize: initialize the given engine */
void HTS_Engine_initialize(HTS_Engine *engine);

/* HTS_Engine_load: load the HTS voices given in voices (num_voices entries) */
HTS_Boolean HTS_Engine_load(HTS_Engine *engine, char **voices, size_t num_voices);
/* HTS_Engine_set_sampling_frequency: set the sampling frequency */
void HTS_Engine_set_sampling_frequency(HTS_Engine *engine, size_t i);

/* HTS_Engine_get_sampling_frequency: return the sampling frequency */
size_t HTS_Engine_get_sampling_frequency(HTS_Engine *engine);

/* HTS_Engine_set_fperiod: set the frame period */
void HTS_Engine_set_fperiod(HTS_Engine *engine, size_t i);

/* HTS_Engine_get_fperiod: return the frame period */
size_t HTS_Engine_get_fperiod(HTS_Engine *engine);

/* HTS_Engine_set_audio_buff_size: set the size of the audio buffer */
void HTS_Engine_set_audio_buff_size(HTS_Engine *engine, size_t i);

/* HTS_Engine_get_audio_buff_size: return the size of the audio buffer */
size_t HTS_Engine_get_audio_buff_size(HTS_Engine *engine);

/* HTS_Engine_set_stop_flag: set the stop flag */
void HTS_Engine_set_stop_flag(HTS_Engine *engine, HTS_Boolean b);

/* HTS_Engine_get_stop_flag: return the stop flag */
HTS_Boolean HTS_Engine_get_stop_flag(HTS_Engine *engine);

/* HTS_Engine_set_volume: set the volume, expressed in dB */
void HTS_Engine_set_volume(HTS_Engine *engine, double f);

/* HTS_Engine_get_volume: return the volume, expressed in dB */
double HTS_Engine_get_volume(HTS_Engine *engine);
/* HTS_Engine_set_msd_threshold: set the MSD threshold of one stream */
void HTS_Engine_set_msd_threshold(HTS_Engine *engine, size_t stream_index, double f);

/* HTS_Engine_get_msd_threshold: return the MSD threshold of one stream */
double HTS_Engine_get_msd_threshold(HTS_Engine *engine, size_t stream_index);

/* HTS_Engine_set_gv_weight: set the GV weight of one stream */
void HTS_Engine_set_gv_weight(HTS_Engine *engine, size_t stream_index, double f);

/* HTS_Engine_get_gv_weight: return the GV weight of one stream */
double HTS_Engine_get_gv_weight(HTS_Engine *engine, size_t stream_index);
/* HTS_Engine_set_speed: set the speech speed */
void HTS_Engine_set_speed(HTS_Engine *engine, double f);

/* HTS_Engine_set_phoneme_alignment_flag: choose whether the phoneme alignment
   in the label is used */
void HTS_Engine_set_phoneme_alignment_flag(HTS_Engine *engine, HTS_Boolean b);

/* HTS_Engine_set_alpha: set alpha */
void HTS_Engine_set_alpha(HTS_Engine *engine, double f);

/* HTS_Engine_get_alpha: return alpha */
double HTS_Engine_get_alpha(HTS_Engine *engine);

/* HTS_Engine_set_beta: set beta */
void HTS_Engine_set_beta(HTS_Engine *engine, double f);

/* HTS_Engine_get_beta: return beta */
double HTS_Engine_get_beta(HTS_Engine *engine);

/* HTS_Engine_add_half_tone: add half tone */
void HTS_Engine_add_half_tone(HTS_Engine *engine, double f);
/* HTS_Engine_set_duration_interpolation_weight: set the duration
   interpolation weight of one voice */
void HTS_Engine_set_duration_interpolation_weight(HTS_Engine *engine, size_t voice_index, double f);

/* HTS_Engine_get_duration_interpolation_weight: return the duration
   interpolation weight of one voice */
double HTS_Engine_get_duration_interpolation_weight(HTS_Engine *engine, size_t voice_index);

/* HTS_Engine_set_parameter_interpolation_weight: set the parameter
   interpolation weight of one voice/stream pair */
void HTS_Engine_set_parameter_interpolation_weight(HTS_Engine *engine, size_t voice_index, size_t stream_index, double f);

/* HTS_Engine_get_parameter_interpolation_weight: return the parameter
   interpolation weight of one voice/stream pair */
double HTS_Engine_get_parameter_interpolation_weight(HTS_Engine *engine, size_t voice_index, size_t stream_index);

/* HTS_Engine_set_gv_interpolation_weight: set the GV interpolation weight of
   one voice/stream pair */
void HTS_Engine_set_gv_interpolation_weight(HTS_Engine *engine, size_t voice_index, size_t stream_index, double f);

/* HTS_Engine_get_gv_interpolation_weight: return the GV interpolation weight
   of one voice/stream pair */
double HTS_Engine_get_gv_interpolation_weight(HTS_Engine *engine, size_t voice_index, size_t stream_index);
/* HTS_Engine_get_total_state: return the total number of states */
size_t HTS_Engine_get_total_state(HTS_Engine *engine);

/* HTS_Engine_set_state_mean: set one mean value of a state */
void HTS_Engine_set_state_mean(HTS_Engine *engine, size_t stream_index, size_t state_index, size_t vector_index, double f);

/* HTS_Engine_get_state_mean: return one mean value of a state */
double HTS_Engine_get_state_mean(HTS_Engine *engine, size_t stream_index, size_t state_index, size_t vector_index);

/* HTS_Engine_get_state_duration: return the duration of a state */
size_t HTS_Engine_get_state_duration(HTS_Engine *engine, size_t state_index);

/* HTS_Engine_get_nvoices: return the number of voices */
size_t HTS_Engine_get_nvoices(HTS_Engine *engine);

/* HTS_Engine_get_nstream: return the number of streams */
size_t HTS_Engine_get_nstream(HTS_Engine *engine);

/* HTS_Engine_get_nstate: return the number of states */
size_t HTS_Engine_get_nstate(HTS_Engine *engine);
2015-12-30 17:05:25 -06:00
/* HTS_Engine_get_fullcontext_label_format: return the full context label
   format */
const char *HTS_Engine_get_fullcontext_label_format(HTS_Engine *engine);

/* HTS_Engine_get_fullcontext_label_version: return the full context label
   version */
const char *HTS_Engine_get_fullcontext_label_version(HTS_Engine *engine);
2014-03-10 09:32:03 +00:00
/* HTS_Engine_get_total_frame: return the total number of frames */
size_t HTS_Engine_get_total_frame(HTS_Engine *engine);

/* HTS_Engine_get_nsamples: return the number of samples */
size_t HTS_Engine_get_nsamples(HTS_Engine *engine);

/* HTS_Engine_get_generated_parameter: return one generated parameter value */
double HTS_Engine_get_generated_parameter(HTS_Engine *engine, size_t stream_index, size_t frame_index, size_t vector_index);

/* HTS_Engine_get_generated_speech: return the generated speech at index */
double HTS_Engine_get_generated_speech(HTS_Engine *engine, size_t index);
/* HTS_Engine_synthesize_from_fn: synthesize speech, input given by file name */
HTS_Boolean HTS_Engine_synthesize_from_fn(HTS_Engine *engine, const char *fn);

/* HTS_Engine_synthesize_from_strings: synthesize speech, input given as a
   list of strings */
HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine *engine, char **lines, size_t num_lines);

/* HTS_Engine_generate_state_sequence_from_fn: 1st synthesis step -- generate
   the state sequence, input given by file name */
HTS_Boolean HTS_Engine_generate_state_sequence_from_fn(HTS_Engine *engine, const char *fn);

/* HTS_Engine_generate_state_sequence_from_strings: 1st synthesis step --
   generate the state sequence, input given as a list of strings */
HTS_Boolean HTS_Engine_generate_state_sequence_from_strings(HTS_Engine *engine, char **lines, size_t num_lines);

/* HTS_Engine_generate_parameter_sequence: 2nd synthesis step -- generate the
   parameter sequence */
HTS_Boolean HTS_Engine_generate_parameter_sequence(HTS_Engine *engine);

/* HTS_Engine_generate_sample_sequence: 3rd synthesis step -- generate the
   sample sequence */
HTS_Boolean HTS_Engine_generate_sample_sequence(HTS_Engine *engine);
/* HTS_Engine_save_information: save trace information to fp */
void HTS_Engine_save_information(HTS_Engine *engine, FILE *fp);

/* HTS_Engine_save_label: save the label with time to fp */
void HTS_Engine_save_label(HTS_Engine *engine, FILE *fp);

/* HTS_Engine_save_generated_parameter: save the generated parameter of one
   stream to fp */
void HTS_Engine_save_generated_parameter(HTS_Engine *engine, size_t stream_index, FILE *fp);

/* HTS_Engine_save_generated_speech: save the generated speech to fp */
void HTS_Engine_save_generated_speech(HTS_Engine *engine, FILE *fp);

/* HTS_Engine_save_riff: save a RIFF format file to fp */
void HTS_Engine_save_riff(HTS_Engine *engine, FILE *fp);
/* HTS_Engine_refresh: free the memory used by a single synthesis run */
void HTS_Engine_refresh(HTS_Engine *engine);

/* HTS_Engine_clear: free the engine */
void HTS_Engine_clear(HTS_Engine *engine);
HTS_ENGINE_H_END ;
# endif /* !HTS_ENGINE_H */