2025-11-28 00:35:46 +09:00

408 lines
14 KiB
C++

// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright © Microsoft Corporation. All rights reserved
/******************************************************************************
* srengobj.h
* This file contains the declaration of the CSrEngine class.
* This implements ISpSREngine, ISpSREngine2 and ISpObjectWithToken.
* This is the main recognition object
******************************************************************************/
#pragma once
#include "stdafx.h"
#include "SampleSrEngine.h"
#include "resource.h"
// A list of reco contexts is stored. Each entry in the list is an instance of this class.
class CContext
{
public:
    // Link to the next entry in the list. Kept public because the list
    // container manipulates it directly (contexts are stored in a
    // CSpBasicQueue<CContext>, see CSrEngine::m_ContextList).
    // Starts out NULL so an entry that has not been linked yet never
    // carries an indeterminate pointer.
    CContext * m_pNext;

    // Compares this entry with a SAPI reco context handle.
    // Returns non-zero when the entry wraps hContext.
    // The operator only reads state, so it is const-qualified.
    BOOL operator==(SPRECOCONTEXTHANDLE hContext) const
    {
        return (m_hSapiContext == hContext);
    }

    // Creates an unlinked entry wrapping the handle given by SAPI.
    CContext(SPRECOCONTEXTHANDLE hSapiContext) :
        m_pNext(NULL),
        m_hSapiContext(hSapiContext)
    {
    }

    SPRECOCONTEXTHANDLE m_hSapiContext; // The reco context handle given by SAPI
};
// A list of reco grammars is stored. Each entry in the list is an instance of this class.
class CDrvGrammar
{
public:
    // Link to the next entry in the list. Kept public because the list
    // container manipulates it directly (grammars are stored in a
    // CSpBasicQueue<CDrvGrammar>, see CSrEngine::m_GrammarList).
    CDrvGrammar * m_pNext;
    SPGRAMMARHANDLE m_hSapiGrammar;     // The grammar handle given by SAPI
    BOOL m_SLMLoaded;                   // Does the grammar have an associated SLM for dictation
    BOOL m_SLMActive;                   // Is the dictation active
    WCHAR* m_pWordSequenceText;         // The text of the word sequence buffer if one is set
    ULONG m_cchText;                    // The size of the word sequence buffer
    SPTEXTSELECTIONINFO* m_pInfo;       // The text selection of the word sequence buffer

    // Creates an unlinked grammar entry for the given SAPI grammar handle:
    // no SLM loaded, dictation inactive, no word sequence buffer.
    CDrvGrammar(SPGRAMMARHANDLE hSapiGrammar) :
        m_pNext(NULL),
        m_hSapiGrammar(hSapiGrammar),
        m_SLMLoaded(FALSE),
        m_SLMActive(FALSE),
        m_pWordSequenceText(NULL),
        m_cchText(0),
        m_pInfo(NULL)
    {
    }

    ~CDrvGrammar()
    {
        // Intentionally empty.
        // For each grammar object that is about to be released, SAPI calls
        // SetWordSequenceData(NULL, 0, NULL). SetWordSequenceData and
        // SetTextSelection release the memory referred to by
        // m_pWordSequenceText and m_pInfo, so it must not be freed again here.
    }

#ifdef _WIN32_WCE
    // Windows CE only: default constructor and Compare().
    // NOTE(review): presumably required by a container template on that
    // platform - confirm. The default constructor now puts every member into
    // the same empty state as the handle-taking constructor instead of
    // leaving the pointers and flags indeterminate.
    CDrvGrammar() :
        m_pNext(NULL),
        m_hSapiGrammar(),
        m_SLMLoaded(FALSE),
        m_SLMActive(FALSE),
        m_pWordSequenceText(NULL),
        m_cchText(0),
        m_pInfo(NULL)
    {
    }

    // Always reports the two grammars as equal (returns 0).
    static LONG Compare(const CDrvGrammar *, const CDrvGrammar *)
    {
        return 0;
    }
#endif
};
// The RecognizeStream thread reads audio data in blocks. For each block
// it decides if the data is speech or silence and adds that value to this queue.
// The decoder thread reads these values and processes them.
// A critical section is used to make the queue thread-safe, and an event is used to
// show if the buffer has space or not.
// This very roughly simulates the idea of doing feature extraction on
// one thread and passing the feature stream to the decoder.
class CFrameQueue
{
public:
BOOL m_aFrames[100]; // The queue of speech/silence values
ULONG m_cFrames;
ULONG m_ulHeadIndex;
HANDLE m_hSpaceAvailEvent;
CRITICAL_SECTION m_cs;
CFrameQueue()
{
m_cFrames = 0;
m_ulHeadIndex = 0;
m_hSpaceAvailEvent = NULL;
InitializeCriticalSection(&m_cs);
}
~CFrameQueue()
{
DeleteCriticalSection(&m_cs);
}
void SetSpaceAvailEvent(HANDLE h)
{
m_hSpaceAvailEvent = h;
}
void InsertTail(BOOL b)
{
EnterCriticalSection(&m_cs);
ULONG ulTailIndex = (m_ulHeadIndex + m_cFrames) % sp_countof(m_aFrames);
m_aFrames[ulTailIndex] = b;
m_cFrames++;
if (m_cFrames == sp_countof(m_aFrames))
{
ResetEvent(m_hSpaceAvailEvent);
}
LeaveCriticalSection(&m_cs);
}
BOOL IsFull()
{
EnterCriticalSection(&m_cs);
BOOL b = (m_cFrames == sp_countof(m_aFrames));
LeaveCriticalSection(&m_cs);
return b;
}
BOOL RemoveHead()
{
EnterCriticalSection(&m_cs);
BOOL b = m_aFrames[m_ulHeadIndex];
m_ulHeadIndex = (m_ulHeadIndex + 1) % sp_countof(m_aFrames);
m_cFrames--;
SetEvent(m_hSpaceAvailEvent);
LeaveCriticalSection(&m_cs);
return b;
}
BOOL HasData()
{
EnterCriticalSection(&m_cs);
ULONG cFrames = m_cFrames;
LeaveCriticalSection(&m_cs);
return cFrames;
}
};
// Class so we can use CSpBasicQueue to store rule information
class CRuleEntry
{
public:
BOOL operator==(SPRULEHANDLE rh)
{
return (m_hRule == rh);
}
CRuleEntry * m_pNext;
SPRULEHANDLE m_hRule; // SAPI rule handle
BOOL m_fTopLevel; // Shows if rule can be activated
BOOL m_fActive; // Shows if rule is currectly active
};
// The main CSrEngine class
class ATL_NO_VTABLE CSrEngine :
    public CComObjectRootEx<CComMultiThreadModel>,
    public CComCoClass<CSrEngine, &CLSID_SampleSREngine>,
    public ISpSREngine2,
    public ISpObjectWithToken,
    public ISpThreadTask
{
public:
    // Puts every scalar member into a known state. The initialisers are
    // listed in declaration order, which is the order in which C++ actually
    // runs them. Members with their own default constructors (the queues,
    // the frame queue and the CComPtr smart pointers) are left to those.
    CSrEngine() :
        m_hRequestSync(NULL),
        m_cBlahBlah(0),
        m_ulNextGrammarIndex(0),
        m_cActive(0),
        m_ullStart(0),
        m_ullEnd(0),
        m_bSoundStarted(FALSE),
        m_bPhraseStarted(FALSE),
        m_hQueueHasRoom(NULL),
        m_LangID(0)
    {}

    DECLARE_REGISTRY_RESOURCEID(IDR_SRENG)

    DECLARE_PROTECT_FINAL_CONSTRUCT()

    // Interfaces exposed through QueryInterface. ISpThreadTask is a base
    // class of this object but is deliberately not listed in the map.
    BEGIN_COM_MAP(CSrEngine)
        COM_INTERFACE_ENTRY(ISpSREngine)
        COM_INTERFACE_ENTRY(ISpSREngine2)
        COM_INTERFACE_ENTRY(ISpObjectWithToken)
    END_COM_MAP()

private:
    HANDLE m_hRequestSync;                          // NOTE(review): presumably the hRequestSync handle passed to RecognizeStream - confirm
    CFrameQueue m_FrameQueue;                       // Queue of speech/silence values
    ULONG m_cBlahBlah;                              // Not referenced anywhere in this header
    CSpBasicQueue<CDrvGrammar> m_GrammarList;       // The list of reco grammars
    CSpBasicQueue<CContext> m_ContextList;          // The list of reco contexts
    ULONG m_ulNextGrammarIndex;
    ULONG m_cActive;
    ULONGLONG m_ullStart;
    ULONGLONG m_ullEnd;
    // Plain BOOLs rather than 1-bit bit-fields: BOOL is a signed type, and a
    // signed 1-bit field can only hold 0 and -1, so storing TRUE would read
    // back as -1 and a comparison against TRUE would never succeed.
    BOOL m_bSoundStarted;
    BOOL m_bPhraseStarted;
    CComPtr<ISpSREngineSite> m_cpSite;
    CComPtr<ISpThreadControl> m_cpDecoderThread;
    HANDLE m_hQueueHasRoom;                         // NOTE(review): presumably the event handed to m_FrameQueue.SetSpaceAvailEvent - confirm
    CSpBasicQueue<CRuleEntry> m_RuleList;           // The list of rule entries
    CComPtr<ISpLexicon> m_cpLexicon;
    CComPtr<ISpObjectToken> m_cpEngineObjectToken;
    CComPtr<ISpObjectToken> m_cpUserObjectToken;
    LANGID m_LangID;

public:
    // Internal helpers (defined outside this header)
    HRESULT RandomlyWalkRule(SPRECORESULTINFO * pResult, ULONG nWords, ULONGLONG ullAudioPos, ULONG ulAudioSize);
    HRESULT RecurseWalk(SPSTATEHANDLE hState, SPPATHENTRY * pPath, ULONG * pcTrans);
    HRESULT WalkCFGRule(SPRECORESULTINFO * pResult, ULONG cRulesActive, BOOL fHypothesis,
                        ULONG nWords, ULONGLONG ullAudioPos, ULONG ulAudioSize);
    HRESULT WalkSLM(SPRECORESULTINFO * pResult, ULONG cSLMActive,
                    ULONG nWords, ULONGLONG ullAudioPos, ULONG ulAudioSize);
    HRESULT WalkTextBuffer(void* pvGrammarCookie, SPPATHENTRY * pPath, SPTRANSITIONID hId, ULONG * pcTrans);

    HRESULT AddEvent(SPEVENTENUM eEvent, ULONGLONG ullStreamPos, WPARAM wParam = 0, LPARAM lParam = 0);
    HRESULT AddEventString(SPEVENTENUM eEvent, ULONGLONG ullStreamPos, const WCHAR * psz, WPARAM wParam = 0);

    HRESULT CreatePhraseFromRule( CRuleEntry * pRule, BOOL fHypothesis,
                                  ULONGLONG ullAudioPos, ULONG ulAudioSize,
                                  ISpPhraseBuilder** ppPhrase );
    CRuleEntry* FindRule( ULONG ulRuleIndex );
    CRuleEntry* NextRuleAlt( CRuleEntry * pPriRule, CRuleEntry * pLastRule );

    void _CheckRecognition();
    void _NotifyRecognition(BOOL fHypothesis, ULONG nWords);

    // ATL constructor / destructor
    HRESULT FinalConstruct();
    HRESULT FinalRelease();

    // Initialization methods
    STDMETHODIMP SetObjectToken(ISpObjectToken * pToken);
    STDMETHODIMP GetObjectToken(ISpObjectToken ** ppToken);

    STDMETHODIMP SetRecoProfile(ISpObjectToken * pProfileToken);
    STDMETHODIMP SetSite(ISpSREngineSite *pSite);
    STDMETHODIMP GetInputAudioFormat(const GUID * pSrcFormatId, const WAVEFORMATEX * pSrcWFEX,
                                     GUID * pDesiredFormatId, WAVEFORMATEX ** ppCoMemDesiredWFEX);

    STDMETHODIMP OnCreateRecoContext(SPRECOCONTEXTHANDLE hSAPIRecoContext, void ** ppvDrvCtxt);
    STDMETHODIMP OnDeleteRecoContext(void * pvDrvCtxt);

    STDMETHODIMP OnCreateGrammar(void * pvEngineRecoContext,
                                 SPGRAMMARHANDLE hSAPIGrammar,
                                 void ** ppvEngineGrammar);

    STDMETHODIMP OnDeleteGrammar(void * pvEngineGrammar);

    // CFG methods
    STDMETHODIMP WordNotify(SPCFGNOTIFY Action, ULONG cWords, const SPWORDENTRY * pWords);
    STDMETHODIMP RuleNotify(SPCFGNOTIFY Action, ULONG cRules, const SPRULEENTRY * pRules);

    // Proprietary grammar methods
    //  - used to implement an engine-specific grammar format
    //  - this sample does not implement these, so each one returns E_NOTIMPL
    STDMETHODIMP LoadProprietaryGrammar(void * pvEngineGrammar,
                                        REFGUID rguidParam,
                                        const WCHAR * pszStringParam,
                                        const void * pvDataParam,
                                        ULONG ulDataSize,
                                        SPLOADOPTIONS Options)
    {
        return E_NOTIMPL;
    }

    STDMETHODIMP UnloadProprietaryGrammar(void * pvEngineGrammar)
    {
        return E_NOTIMPL;
    }

    STDMETHODIMP SetProprietaryRuleState(void * pvEngineGrammar,
                                         const WCHAR * pszName,
                                         void * pvReserved,
                                         SPRULESTATE NewState,
                                         ULONG * pcRulesChanged)
    {
        return E_NOTIMPL;
    }

    STDMETHODIMP SetProprietaryRuleIdState(void * pvEngineGrammar,
                                           DWORD dwRuleId,
                                           SPRULESTATE NewState)
    {
        return E_NOTIMPL;
    }

    // Since this engine does not support proprietary grammars, we do not need to implement
    // this method other than just returning S_OK. Note to implementors: Do NOT return
    // E_NOTIMPL. Just return S_OK and ignore this data if you do not need it to implement
    // proprietary grammars.
    STDMETHODIMP SetGrammarState(void * pvEngineGrammar, SPGRAMMARSTATE eGrammarState)
    {
        return S_OK;
    }

    // Context state changes are ignored as well; always returns S_OK.
    STDMETHODIMP SetContextState(void * pvEngineContxt, SPCONTEXTSTATE eCtxtState)
    {
        return S_OK;
    }

    // Dictation methods
    STDMETHODIMP LoadSLM(void * pvEngineGrammar, const WCHAR * pszTopicName);
    STDMETHODIMP UnloadSLM(void * pvEngineGrammar);
    STDMETHODIMP SetSLMState(void * pvEngineGrammar, SPRULESTATE NewState);

    STDMETHODIMP IsPronounceable(void *pDrvGrammar, const WCHAR *pszWord, SPWORDPRONOUNCEABLE * pWordPronounceable);
    STDMETHODIMP SetWordSequenceData(void * pvEngineGrammar, const WCHAR * pText, ULONG cchText, const SPTEXTSELECTIONINFO * pInfo);
    STDMETHODIMP SetTextSelection(void * pvEngineGrammar, const SPTEXTSELECTIONINFO * pInfo);
    STDMETHODIMP SetAdaptationData(void * pvEngineCtxtCookie, const WCHAR * pText, const ULONG cch);

    // Property methods
    STDMETHODIMP SetPropertyNum( SPPROPSRC eSrc, void* pvSrcObj, const WCHAR* pName, LONG lValue );
    STDMETHODIMP GetPropertyNum( SPPROPSRC eSrc, void* pvSrcObj, const WCHAR* pName, LONG * plValue );
    STDMETHODIMP SetPropertyString( SPPROPSRC eSrc, void* pvSrcObj, const WCHAR* pName, const WCHAR* pValue );
    STDMETHODIMP GetPropertyString( SPPROPSRC eSrc, void* pvSrcObj, const WCHAR* pName, __deref_out_opt WCHAR** ppCoMemValue );

    // The main recognition method
    STDMETHODIMP RecognizeStream(REFGUID rguidFmtId, const WAVEFORMATEX * pWaveFormatEx,
                                 HANDLE hRequestSync, HANDLE hDataAvailable,
                                 HANDLE hExit, BOOL fNewAudioStream, BOOL fRealTimeAudio,
                                 ISpObjectToken * pAudioObjectToken);

    STDMETHODIMP PrivateCall(void * pvEngineContext, void * pCallFrame, ULONG ulCallFrameSize);
    STDMETHODIMP PrivateCallEx(void * pvEngineContext, const void * pInCallFrame, ULONG ulCallFrameSize,
                               void ** ppvCoMemResponse, ULONG * pcbResponse);

    // ISpThreadTask methods
    // No per-thread initialization is performed; always returns S_OK.
    STDMETHODIMP InitThread( void * pvTaskData, HWND hwnd )
    {
        return S_OK;
    }

    // Window messages are not handled by this task; returns E_UNEXPECTED
    // (an HRESULT value carried in the LRESULT return type).
    LRESULT STDMETHODCALLTYPE WindowMessage( void *pvTaskData, HWND hWnd, UINT Msg, WPARAM wParam, LPARAM lParam )
    {
        return E_UNEXPECTED;
    }

    STDMETHODIMP ThreadProc( void *pvTaskData, HANDLE hExitThreadEvent, HANDLE hNotifyEvent, HWND hwndWorker, volatile const BOOL * pfContinueProcessing );

    // ISpSREngine2 methods
    STDMETHODIMP PrivateCallImmediate(
        void *pvEngineContext,
        const void *pInCallFrame,
        ULONG ulInCallFrameSize,
        void **ppvCoMemResponse,
        ULONG *pulResponseSize);

    STDMETHODIMP SetAdaptationData2(
        void *pvEngineContext,
        __in_ecount(cch) const WCHAR *pAdaptationData,
        const ULONG cch,
        LPCWSTR pTopicName,
        SPADAPTATIONSETTINGS eSettings,
        SPADAPTATIONRELEVANCE eRelevance);

    STDMETHODIMP SetGrammarPrefix(
        void *pvEngineGrammar,
        __in_opt LPCWSTR pszPrefix,
        BOOL fIsPrefixRequired);

    STDMETHODIMP SetRulePriority(
        SPRULEHANDLE hRule,
        void *pvClientRuleContext,
        int nRulePriority);

    STDMETHODIMP EmulateRecognition(
        ISpPhrase *pPhrase,
        DWORD dwCompareFlags);

    STDMETHODIMP SetSLMWeight(
        void *pvEngineGrammar,
        float flWeight);

    STDMETHODIMP SetRuleWeight(
        SPRULEHANDLE hRule,
        void *pvClientRuleContext,
        float flWeight);

    STDMETHODIMP SetTrainingState(
        BOOL fDoingTraining,
        BOOL fAdaptFromTrainingData);

    STDMETHODIMP ResetAcousticModelAdaptation( void);

    STDMETHODIMP OnLoadCFG(
        void *pvEngineGrammar,
        const SPBINARYGRAMMAR *pGrammarData,
        ULONG ulGrammarID);

    STDMETHODIMP OnUnloadCFG(
        void *pvEngineGrammar,
        ULONG ulGrammarID);
};