From 1a255f1182275f950a22cc98bb3f6e2c12d8f183 Mon Sep 17 00:00:00 2001
From: klappstuhl24
Date: Thu, 4 Apr 2024 11:10:57 +0200
Subject: [PATCH] text2speech

---
 3d Prototyp/Assets/Scripts/Text2Speech.cs     | 423 ++++++++++++++++++
 .../Assets/Scripts/Text2Speech.cs.meta        |  11 +
 2 files changed, 434 insertions(+)
 create mode 100644 3d Prototyp/Assets/Scripts/Text2Speech.cs
 create mode 100644 3d Prototyp/Assets/Scripts/Text2Speech.cs.meta

diff --git a/3d Prototyp/Assets/Scripts/Text2Speech.cs b/3d Prototyp/Assets/Scripts/Text2Speech.cs
new file mode 100644
index 00000000..4ad4e1b4
--- /dev/null
+++ b/3d Prototyp/Assets/Scripts/Text2Speech.cs
@@ -0,0 +1,423 @@
+using UnityEngine;
+using UnityEngine.Networking;
+using System;
+using System.IO;
+using System.Text;
+using Newtonsoft.Json;
+using System.Collections;
+using OpenAI_API;
+using OpenAI_API.Chat;
+using OpenAI_API.Models;
+using System.Threading.Tasks;
+using Newtonsoft.Json.Linq;
+using System.Collections.Generic;
+
+/// <summary>
+/// JSON body returned by the Google Cloud Text-to-Speech <c>text:synthesize</c> request.
+/// </summary>
+[Serializable]
+public class TextToSpeechResponse
+{
+    /// <summary>Base64 text that is decoded into the WAV bytes.</summary>
+    public string audioContent;
+}
+
+/// <summary>
+/// Turns a bullet-point context into a spoken NPC line: asks the OpenAI chat API for a
+/// short text, sends it to Google Cloud Text-to-Speech and plays the returned WAV on the
+/// AudioSource of this GameObject. Set <see cref="generate"/> or <see cref="playSound"/>
+/// to true to trigger the respective action; Update resets both flags.
+/// </summary>
+public class Text2Speech : MonoBehaviour
+{
+    // API keys come from these environment variables so that no secret is committed.
+    private const string GoogleCloudApiKeyVariable = "GOOGLE_CLOUD_API_KEY";
+    private const string OpenAiApiKeyVariable = "OPENAI_API_KEY";
+
+    private const string GoogleCloudVoicesUrl = "https://texttospeech.googleapis.com/v1beta1/voices";
+    private const string GoogleCloudSynthesizeUrl = "https://texttospeech.googleapis.com/v1/text:synthesize";
+
+    // NOTE(review): method names say "German", but the filter has always matched "en-"
+    // language codes. Behavior is kept; switch to "de-" if German voices are wanted.
+    private const string VoiceLanguagePrefix = "en-";
+
+    // Pitch is drawn from a normal distribution and clamped to [MinPitch, MaxPitch].
+    private const double PitchMean = 0;
+    private const double PitchStdDev = 4;
+    private const double MinPitch = -20;
+    private const double MaxPitch = 20;
+
+    private const string OutputFileName = "audio.wav";
+
+    // Shared generator; creating one per request is unnecessary.
+    private static readonly System.Random s_random = new System.Random();
+
+    public string context = "going for a walk, falling, explosion, blood";
+    public string gender = "MALE";
+    public JToken voice = null;
+    public double speakingSpeed = 1.1;
+    public bool playSound = false;
+    public bool generate = false;
+
+    private AudioSource _audioSource;
+    private OpenAIAPI _openAiApi;
+    private string _tmpPath = "tmp_audio.wav";
+
+    private readonly string _prompt = "Write a short text for an NPC in a game.The text should be based on the following bullet-point context, which describes the events of the last moments. Remember to only respond with the short text that the ONE NPC should speak! The context is: ";
+
+    /// <summary>Caches the AudioSource and derives a per-instance temp file name.</summary>
+    void Start()
+    {
+        _tmpPath = "tmp_audio_" + GetInstanceID().ToString() + ".wav";
+        _audioSource = GetComponent<AudioSource>();
+        if (_audioSource == null)
+        {
+            Debug.LogError("Text2Speech: No AudioSource found on this GameObject.");
+        }
+    }
+
+    /// <summary>Polls the <see cref="playSound"/> and <see cref="generate"/> trigger flags.</summary>
+    public void Update()
+    {
+        if (playSound)
+        {
+            playSound = false;
+            if (_audioSource != null)
+            {
+                _audioSource.Play();
+            }
+        }
+
+        if (!generate)
+        {
+            return;
+        }
+
+        generate = false;
+        if (voice != null)
+        {
+            StartCoroutine(GenerateAndSynthesizeText(context));
+            return;
+        }
+
+        GetRandomGermanVoice(gender, (v) =>
+        {
+            // A null voice means the lookup failed (already logged); synthesizing
+            // without one would only end in a NullReferenceException later.
+            if (v == null)
+            {
+                return;
+            }
+
+            voice = v;
+            StartCoroutine(GenerateAndSynthesizeText(context));
+        });
+    }
+
+    /// <summary>
+    /// Fetches the voice list and passes one random matching voice to <paramref name="callback"/>.
+    /// </summary>
+    /// <param name="gender">SSML gender the voice must have, e.g. "MALE".</param>
+    /// <param name="callback">Receives the chosen voice, or null when the lookup failed.</param>
+    public void GetRandomGermanVoice(string gender, Action<JToken> callback)
+    {
+        StartCoroutine(GetRandomGermanVoiceCoroutine(gender, callback));
+    }
+
+    private IEnumerator GetRandomGermanVoiceCoroutine(string gender, Action<JToken> callback)
+    {
+        string apiKey = Environment.GetEnvironmentVariable(GoogleCloudApiKeyVariable);
+        if (string.IsNullOrEmpty(apiKey))
+        {
+            Debug.LogError("GoogleCloud: API key missing, set " + GoogleCloudApiKeyVariable + ".");
+            callback?.Invoke(null);
+            yield break;
+        }
+
+        string url = GoogleCloudVoicesUrl + "?key=" + UnityWebRequest.EscapeURL(apiKey);
+        using (UnityWebRequest webRequest = UnityWebRequest.Get(url))
+        {
+            yield return webRequest.SendWebRequest();
+
+            if (webRequest.result != UnityWebRequest.Result.Success)
+            {
+                Debug.LogError(webRequest.error);
+                callback?.Invoke(null);
+                yield break;
+            }
+
+            List<JToken> filteredVoices = FilterVoices(webRequest.downloadHandler.text, gender);
+            if (filteredVoices.Count == 0)
+            {
+                Debug.LogError("GoogleCloud: No matching voice found.");
+                callback?.Invoke(null);
+                yield break;
+            }
+
+            JToken randomVoice = filteredVoices[UnityEngine.Random.Range(0, filteredVoices.Count)];
+            randomVoice["pitch"] = SampleRandomPitch();
+            callback?.Invoke(randomVoice);
+        }
+    }
+
+    /// <summary>
+    /// Returns the non-"Standard" voices from the voices response whose first language
+    /// code contains <see cref="VoiceLanguagePrefix"/> and whose gender equals <paramref name="gender"/>.
+    /// </summary>
+    private static List<JToken> FilterVoices(string json, string gender)
+    {
+        var matches = new List<JToken>();
+
+        JArray voices;
+        try
+        {
+            voices = JObject.Parse(json)["voices"] as JArray;
+        }
+        catch (JsonException e)
+        {
+            Debug.LogError("GoogleCloud: Could not parse voice list: " + e.Message);
+            return matches;
+        }
+
+        if (voices == null)
+        {
+            return matches;
+        }
+
+        foreach (JToken v in voices)
+        {
+            JArray languageCodes = v["languageCodes"] as JArray;
+            string name = v["name"]?.ToString();
+            if (languageCodes == null || languageCodes.Count == 0 || name == null)
+            {
+                continue;
+            }
+
+            string languageCode = languageCodes[0].ToString();
+            string ssmlGender = v["ssmlGender"]?.ToString();
+            if (languageCode.Contains(VoiceLanguagePrefix) && ssmlGender == gender && !name.Contains("Standard"))
+            {
+                matches.Add(v);
+            }
+        }
+
+        return matches;
+    }
+
+    /// <summary>Draws an integer pitch via the Box-Muller transform, clamped to the allowed range.</summary>
+    private static int SampleRandomPitch()
+    {
+        // 1.0 - NextDouble() lies in (0, 1], so Math.Log never receives 0.
+        double u1 = 1.0 - s_random.NextDouble();
+        double u2 = 1.0 - s_random.NextDouble();
+        double standardNormal = Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Sin(2.0 * Math.PI * u2);
+        double pitch = PitchMean + PitchStdDev * standardNormal;
+        return (int)Math.Round(Math.Max(Math.Min(pitch, MaxPitch), MinPitch));
+    }
+
+    /// <summary>Generates the NPC text and, once available, starts speech synthesis for it.</summary>
+    IEnumerator GenerateAndSynthesizeText(string context)
+    {
+        Task<string> generateTextTask = GenerateText(context);
+
+        yield return new WaitUntil(() => generateTextTask.IsCompleted);
+
+        if (generateTextTask.IsFaulted)
+        {
+            Debug.LogError(generateTextTask.Exception.ToString());
+            yield break;
+        }
+
+        // Reading Result of a canceled task would throw inside the coroutine.
+        if (generateTextTask.IsCanceled)
+        {
+            Debug.LogError("ChatGPT: Text generation was canceled.");
+            yield break;
+        }
+
+        StartCoroutine(SynthesizeSpeech(generateTextTask.Result));
+    }
+
+    /// <summary>Asks the chat model for one short NPC line based on <paramref name="context"/>.</summary>
+    /// <exception cref="InvalidOperationException">The OpenAI API key is not configured.</exception>
+    async Task<string> GenerateText(string context)
+    {
+        if (_openAiApi == null)
+        {
+            string apiKey = Environment.GetEnvironmentVariable(OpenAiApiKeyVariable);
+            if (string.IsNullOrEmpty(apiKey))
+            {
+                throw new InvalidOperationException("ChatGPT: API key missing, set " + OpenAiApiKeyVariable + ".");
+            }
+
+            _openAiApi = new OpenAIAPI(apiKey);
+        }
+
+        ChatRequest chatRequest = new ChatRequest
+        {
+            Temperature = 0.9,
+            Model = Model.ChatGPTTurbo
+        };
+
+        Conversation conversation = _openAiApi.Chat.CreateConversation(chatRequest);
+        conversation.AppendUserInput(_prompt + context);
+        return await conversation.GetResponseFromChatbotAsync();
+    }
+
+    /// <summary>Synthesizes <paramref name="textToSynthesize"/> with the selected voice and plays it.</summary>
+    IEnumerator SynthesizeSpeech(string textToSynthesize)
+    {
+        JToken selectedVoice = voice;
+        JArray languageCodes = selectedVoice?["languageCodes"] as JArray;
+        if (languageCodes == null || languageCodes.Count == 0)
+        {
+            Debug.LogError("GoogleCloud: No usable voice selected.");
+            yield break;
+        }
+
+        string apiKey = Environment.GetEnvironmentVariable(GoogleCloudApiKeyVariable);
+        if (string.IsNullOrEmpty(apiKey))
+        {
+            Debug.LogError("GoogleCloud: API key missing, set " + GoogleCloudApiKeyVariable + ".");
+            yield break;
+        }
+
+        var requestObject = new
+        {
+            input = new { text = textToSynthesize },
+            voice = new
+            {
+                languageCode = languageCodes[0].ToString(),
+                name = selectedVoice["name"],
+                ssmlGender = selectedVoice["ssmlGender"]
+            },
+            audioConfig = new
+            {
+                audioEncoding = "LINEAR16",
+                speakingRate = speakingSpeed,
+                pitch = selectedVoice["pitch"]
+            }
+        };
+
+        byte[] requestBody = Encoding.UTF8.GetBytes(JsonConvert.SerializeObject(requestObject));
+        string url = GoogleCloudSynthesizeUrl + "?key=" + UnityWebRequest.EscapeURL(apiKey);
+
+        using (UnityWebRequest www = new UnityWebRequest(url, "POST"))
+        {
+            www.uploadHandler = new UploadHandlerRaw(requestBody);
+            www.downloadHandler = new DownloadHandlerBuffer();
+            www.SetRequestHeader("Content-Type", "application/json");
+
+            yield return www.SendWebRequest();
+
+            if (www.result != UnityWebRequest.Result.Success)
+            {
+                Debug.LogError("GoogleCloud: Error: " + www.error);
+                Debug.LogError("GoogleCloud: Response: " + www.downloadHandler.text);
+                yield break;
+            }
+
+            if (TryReadAudioContent(www.downloadHandler.text, out string audioContent))
+            {
+                SetAudioClip(audioContent);
+            }
+        }
+    }
+
+    /// <summary>Extracts the Base64 audio from the synthesize response; logs and returns false on failure.</summary>
+    private static bool TryReadAudioContent(string json, out string audioContent)
+    {
+        audioContent = null;
+        try
+        {
+            audioContent = JsonConvert.DeserializeObject<TextToSpeechResponse>(json)?.audioContent;
+        }
+        catch (JsonException e)
+        {
+            Debug.LogError("GoogleCloud: Could not parse synthesize response: " + e.Message);
+            return false;
+        }
+
+        if (string.IsNullOrEmpty(audioContent))
+        {
+            Debug.LogError("GoogleCloud: Response contains no audio content.");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// <summary>Decodes the Base64 WAV once, loads it for playback and stores a copy on disk.</summary>
+    private void SetAudioClip(string base64AudioContent)
+    {
+        byte[] audioBytes;
+        try
+        {
+            audioBytes = Convert.FromBase64String(base64AudioContent);
+        }
+        catch (FormatException e)
+        {
+            Debug.LogError("GoogleCloud: Audio content is not valid Base64: " + e.Message);
+            return;
+        }
+
+        string tempFilePath = Path.Combine(Application.temporaryCachePath, _tmpPath);
+        if (TryWriteFile(tempFilePath, audioBytes))
+        {
+            StartCoroutine(LoadWavAudio(tempFilePath));
+        }
+
+        SaveAudioFile(audioBytes);
+    }
+
+    /// <summary>Loads the WAV file at <paramref name="path"/> into the AudioSource and plays it.</summary>
+    IEnumerator LoadWavAudio(string path)
+    {
+        // Uri builds a well-formed file URI on every platform (slashes, escaping).
+        string uri = new Uri(path).AbsoluteUri;
+        using (UnityWebRequest www = UnityWebRequestMultimedia.GetAudioClip(uri, AudioType.WAV))
+        {
+            yield return www.SendWebRequest();
+
+            if (www.result != UnityWebRequest.Result.Success)
+            {
+                Debug.LogError("Fehler beim Laden des AudioClips: " + www.error);
+                yield break;
+            }
+
+            if (_audioSource == null)
+            {
+                yield break;
+            }
+
+            _audioSource.clip = DownloadHandlerAudioClip.GetContent(www);
+            _audioSource.Play();
+        }
+    }
+
+    /// <summary>Stores the WAV bytes as audio.wav below Application.persistentDataPath.</summary>
+    private void SaveAudioFile(byte[] audioBytes)
+    {
+        // A machine-specific absolute path would fail on every other computer.
+        string outputPath = Path.Combine(Application.persistentDataPath, OutputFileName);
+        if (TryWriteFile(outputPath, audioBytes))
+        {
+            Debug.Log($"GoogleCloud: Successfully saved WAV file as {outputPath}");
+        }
+    }
+
+    /// <summary>Writes <paramref name="bytes"/> to <paramref name="path"/>; logs and returns false on I/O failure.</summary>
+    private static bool TryWriteFile(string path, byte[] bytes)
+    {
+        try
+        {
+            File.WriteAllBytes(path, bytes);
+            return true;
+        }
+        catch (Exception e) when (e is IOException || e is UnauthorizedAccessException)
+        {
+            Debug.LogError("Text2Speech: Could not write " + path + ": " + e.Message);
+            return false;
+        }
+    }
+}
diff --git a/3d Prototyp/Assets/Scripts/Text2Speech.cs.meta b/3d Prototyp/Assets/Scripts/Text2Speech.cs.meta
new file mode 100644
index 00000000..1dbd1e73
--- /dev/null
+++ b/3d Prototyp/Assets/Scripts/Text2Speech.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: ef5183dac70a54b4cbed3e05d617524f
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData:
+  assetBundleName:
+  assetBundleVariant: