GameVsJam/3d Prototyp/Assets/Scripts/Text2Speech.cs

using UnityEngine;
using UnityEngine.Networking;
using System;
using System.IO;
using System.Text;
using Newtonsoft.Json;
using System.Collections;
using OpenAI_API;
using OpenAI_API.Chat;
using OpenAI_API.Models;
using System.Threading.Tasks;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;

/// <summary>
/// Shape of the JSON body this file deserializes from the text-to-speech synthesize
/// request. Only the field that the script reads is declared.
/// </summary>
[Serializable]
public class TextToSpeechResponse
{
    /// <summary>
    /// Audio payload as text; the script passes it to <c>Convert.FromBase64String</c>
    /// before writing it to disk as a WAV file.
    /// </summary>
    public string audioContent;
}


/// <summary>
/// Generates a short NPC line with the OpenAI chat API from the bullet-point
/// <see cref="context"/>, synthesizes it with Google Cloud Text-to-Speech and plays the
/// result on the <see cref="AudioSource"/> found on the same GameObject.
/// </summary>
/// <remarks>
/// API keys are intentionally not stored in source code. Supply them either through the
/// serialized inspector fields or through the environment variables
/// <c>GOOGLE_CLOUD_API_KEY</c> and <c>OPENAI_API_KEY</c>. Keys that were ever committed
/// to version control should be treated as compromised and rotated.
/// </remarks>
public class Text2Speech : MonoBehaviour
{
    /// <summary>Bullet-point description of recent events; appended to the prompt.</summary>
    public string context = "going for a walk, falling, explosion, blood";

    /// <summary>Value compared against each voice's <c>ssmlGender</c> when picking a voice.</summary>
    public string gender = "MALE";

    /// <summary>Voice entry used for synthesis; picked at random on the first generation when null.</summary>
    public JToken voice = null;

    /// <summary>Value sent as <c>speakingRate</c> in the synthesize request.</summary>
    public double speakingSpeed = 1.1;

    /// <summary>Set to true to play the current clip once; reset by <see cref="Update"/>.</summary>
    public bool playSound = false;

    /// <summary>Set to true to trigger one generation; reset by <see cref="Update"/>.</summary>
    public bool generate = false;

    // Optional inspector overrides; when empty the environment variables below are used.
    [SerializeField] private string _googleCloudApiKey = "";
    [SerializeField] private string _openAiApiKey = "";

    private const string GoogleCloudApiKeyEnvVar = "GOOGLE_CLOUD_API_KEY";
    private const string OpenAiApiKeyEnvVar = "OPENAI_API_KEY";

    // Parameters of the random pitch offset assigned to a freshly picked voice.
    private const double PitchMean = 0;
    private const double PitchStdDev = 4;
    private const double MinPitch = -20;
    private const double MaxPitch = 20;

    // Shared generator: creating a new System.Random per call can yield identical
    // sequences for instances constructed in quick succession.
    private static readonly System.Random s_random = new System.Random();

    private AudioSource _audioSource;

    // Clip created by this component at runtime; tracked so it can be destroyed when replaced.
    private AudioClip _generatedClip;

    // Last temp WAV written by this component; removed in OnDestroy.
    private string _writtenTempFilePath;

    private readonly string _google_CloudApiUrl = "https://texttospeech.googleapis.com/v1/text:synthesize";
    private readonly string _googleVoicesUrl = "https://texttospeech.googleapis.com/v1beta1/voices";
    private readonly string _outputPath = "path/to/audio.wav";
    private string _tmpPath = "tmp_audio.wav";

    private OpenAIAPI _openAiApi;
    private Conversation _conversation;
    private readonly string _prompt = "Write a short text for an NPC in a game. The text should be based on the following bullet-point context, which describes the events of the last moments. Remember to only respond with the short text that the ONE NPC should speak! The context is: ";

    /// <summary>Derives a per-instance temp file name and caches the AudioSource.</summary>
    void Start()
    {
        _tmpPath = "tmp_audio_" + GetInstanceID().ToString() + ".wav";
        _audioSource = GetComponent<AudioSource>();
        if (_audioSource == null)
        {
            Debug.LogError("Text2Speech: No AudioSource found on this GameObject; audio cannot be played.");
        }
    }

    /// <summary>
    /// Polls the <see cref="playSound"/> and <see cref="generate"/> flags and resets each
    /// one after acting on it.
    /// </summary>
    public void Update()
    {
        if (playSound)
        {
            playSound = false;
            if (_audioSource != null)
            {
                _audioSource.Play();
            }
        }

        if (generate)
        {
            generate = false;
            if (voice == null)
            {
                GetRandomGermanVoice(gender, (v) =>
                {
                    // A null voice means the lookup failed (already logged). Do not
                    // continue: synthesis cannot work without a voice and the text
                    // generation request would be wasted.
                    if (v == null)
                    {
                        return;
                    }

                    voice = v;
                    StartCoroutine(GenerateAndSynthesizeText(context));
                });
            }
            else
            {
                StartCoroutine(GenerateAndSynthesizeText(context));
            }
        }
    }

    /// <summary>Stores <paramref name="c"/> as the context and requests a generation on the next Update.</summary>
    /// <param name="c">Bullet-point context for the NPC line.</param>
    public void Generate(string c)
    {
        context = c;
        generate = true;
    }

    /// <summary>
    /// Fetches the voice list and passes one randomly chosen matching voice to
    /// <paramref name="callback"/>, or null when the lookup fails or nothing matches.
    /// </summary>
    /// <remarks>
    /// Despite the method name, the filter selects voices whose first language code
    /// contains "en-"; the name is kept so existing callers keep compiling.
    /// </remarks>
    /// <param name="gender">Required <c>ssmlGender</c> value of the voice.</param>
    /// <param name="callback">Receives the chosen voice (with an added "pitch" entry) or null.</param>
    public void GetRandomGermanVoice(string gender, Action<JToken> callback)
    {
        StartCoroutine(GetRandomGermanVoiceCoroutine(gender, callback));
    }

    /// <summary>Coroutine behind <see cref="GetRandomGermanVoice"/>.</summary>
    private IEnumerator GetRandomGermanVoiceCoroutine(string gender, Action<JToken> callback)
    {
        string apiKey = ResolveApiKey(_googleCloudApiKey, GoogleCloudApiKeyEnvVar);
        if (apiKey == null)
        {
            Debug.LogError("GoogleCloud: No API key configured (inspector field or " + GoogleCloudApiKeyEnvVar + ").");
            callback?.Invoke(null);
            yield break;
        }

        string url = $"{_googleVoicesUrl}?key={Uri.EscapeDataString(apiKey)}";

        using (UnityWebRequest webRequest = UnityWebRequest.Get(url))
        {
            yield return webRequest.SendWebRequest();

            if (RequestFailed(webRequest))
            {
                Debug.LogError(webRequest.error);
                callback?.Invoke(null);
                yield break;
            }

            JToken randomVoice = PickRandomVoice(webRequest.downloadHandler.text, gender);
            if (randomVoice == null)
            {
                Debug.LogError("GoogleCloud: No matching voice found.");
            }

            callback?.Invoke(randomVoice);
        }
    }

    /// <summary>
    /// Parses the voice list JSON, keeps non-"Standard" voices whose first language code
    /// contains "en-" and whose <c>ssmlGender</c> equals <paramref name="gender"/>, and
    /// returns one of them at random with a random "pitch" entry added.
    /// </summary>
    /// <returns>The chosen voice, or null when the JSON is unusable or nothing matches.</returns>
    private static JToken PickRandomVoice(string voicesJson, string gender)
    {
        if (string.IsNullOrEmpty(voicesJson))
        {
            return null;
        }

        JArray voices;
        try
        {
            voices = JObject.Parse(voicesJson)["voices"] as JArray;
        }
        catch (JsonException e)
        {
            Debug.LogError("GoogleCloud: Could not parse voice list: " + e.Message);
            return null;
        }

        if (voices == null)
        {
            return null;
        }

        List<JToken> filteredVoices = new List<JToken>();

        // Filter by language code and the requested gender; entries lacking any of the
        // inspected properties are skipped instead of throwing.
        foreach (JToken v in voices)
        {
            if (!(v is JObject))
            {
                continue;
            }

            string languageCode = (v["languageCodes"] as JArray)?.First?.ToString();
            string ssmlGender = v["ssmlGender"]?.ToString();
            string name = v["name"]?.ToString();
            if (languageCode == null || ssmlGender == null || name == null)
            {
                continue;
            }

            if (languageCode.Contains("en-") && ssmlGender == gender && !name.Contains("Standard"))
            {
                filteredVoices.Add(v);
            }
        }

        if (filteredVoices.Count == 0)
        {
            return null;
        }

        // Pick a random voice from the filtered results.
        JToken randomVoice = filteredVoices[UnityEngine.Random.Range(0, filteredVoices.Count)];
        randomVoice["pitch"] = SampleRandomPitch();
        return randomVoice;
    }

    /// <summary>
    /// Draws a normally distributed pitch offset (Box-Muller transform, mean 0,
    /// standard deviation 4), clamps it to [-20, 20] and rounds to an integer.
    /// </summary>
    private static int SampleRandomPitch()
    {
        // 1 - NextDouble() lies in (0, 1], which keeps Math.Log away from zero.
        double u1 = 1.0 - s_random.NextDouble();
        double u2 = 1.0 - s_random.NextDouble();
        double standardNormal = Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Sin(2.0 * Math.PI * u2);
        double pitch = PitchMean + PitchStdDev * standardNormal;
        double clampedPitch = Math.Max(Math.Min(pitch, MaxPitch), MinPitch);
        return (int)Math.Round(clampedPitch);
    }

    /// <summary>
    /// Waits for the generated text and, on success, starts speech synthesis for it.
    /// </summary>
    /// <param name="context">Bullet-point context appended to the prompt.</param>
    IEnumerator GenerateAndSynthesizeText(string context)
    {
        Task<string> generateTextTask = GenerateText(context);

        yield return new WaitUntil(() => generateTextTask.IsCompleted);

        if (generateTextTask.IsFaulted)
        {
            Debug.LogError(generateTextTask.Exception.ToString());
        }
        else if (generateTextTask.IsCanceled)
        {
            // Reading Result of a canceled task would throw inside the coroutine.
            Debug.LogError("ChatGPT: Text generation was canceled.");
        }
        else
        {
            string generatedText = generateTextTask.Result;
            if (string.IsNullOrWhiteSpace(generatedText))
            {
                Debug.LogError("ChatGPT: Received an empty response; nothing to synthesize.");
                yield break;
            }

            StartCoroutine(SynthesizeSpeech(generatedText));
        }
    }

    /// <summary>
    /// Sends the prompt plus <paramref name="context"/> to the chat API and returns the reply.
    /// </summary>
    /// <exception cref="InvalidOperationException">No OpenAI API key is configured.</exception>
    async Task<string> GenerateText(string context)
    {
        string apiKey = ResolveApiKey(_openAiApiKey, OpenAiApiKeyEnvVar);
        if (apiKey == null)
        {
            // Thrown inside an async method, so it surfaces as a faulted task to the caller.
            throw new InvalidOperationException("OpenAI: No API key configured (inspector field or " + OpenAiApiKeyEnvVar + ").");
        }

        Model model = Model.ChatGPTTurbo;

        _openAiApi = new OpenAIAPI(apiKey);

        ChatRequest chatRequest = new ChatRequest
        {
            Temperature = 0.9,
            Model = model
        };

        _conversation = _openAiApi.Chat.CreateConversation(chatRequest);
        _conversation.AppendUserInput(_prompt + context);
        string response = await _conversation.GetResponseFromChatbotAsync();
        return response;
    }

    /// <summary>
    /// Requests LINEAR16 audio for <paramref name="textToSynthesize"/> using the current
    /// <see cref="voice"/> and hands the returned audio to <see cref="SetAudioClip"/>.
    /// </summary>
    IEnumerator SynthesizeSpeech(string textToSynthesize)
    {
        // Snapshot the public field so a concurrent reassignment cannot mix two voices.
        JToken selectedVoice = voice;
        string languageCode = selectedVoice == null
            ? null
            : (selectedVoice["languageCodes"] as JArray)?.First?.ToString();
        if (languageCode == null)
        {
            Debug.LogError("GoogleCloud: No usable voice selected; cannot synthesize speech.");
            yield break;
        }

        string apiKey = ResolveApiKey(_googleCloudApiKey, GoogleCloudApiKeyEnvVar);
        if (apiKey == null)
        {
            Debug.LogError("GoogleCloud: No API key configured (inspector field or " + GoogleCloudApiKeyEnvVar + ").");
            yield break;
        }

        // An externally assigned voice may lack the "pitch" entry; fall back to no shift.
        JToken pitchToken = selectedVoice["pitch"] ?? new JValue(0);

        var requestObject = new
        {
            input = new { text = textToSynthesize },
            voice = new
            {
                languageCode = languageCode,
                name = selectedVoice["name"],
                ssmlGender = selectedVoice["ssmlGender"]
            },
            audioConfig = new
            {
                audioEncoding = "LINEAR16",
                speakingRate = speakingSpeed,
                pitch = pitchToken
            }
        };

        string jsonRequestBody = JsonConvert.SerializeObject(requestObject);
        byte[] requestBody = Encoding.UTF8.GetBytes(jsonRequestBody);

        using (UnityWebRequest www = new UnityWebRequest(_google_CloudApiUrl + "?key=" + Uri.EscapeDataString(apiKey), "POST"))
        {
            www.uploadHandler = new UploadHandlerRaw(requestBody);
            www.downloadHandler = new DownloadHandlerBuffer();
            www.SetRequestHeader("Content-Type", "application/json");

            yield return www.SendWebRequest();

            if (RequestFailed(www))
            {
                Debug.LogError("GoogleCloud: Error: " + www.error);
                Debug.LogError("GoogleCloud: Response: " + www.downloadHandler.text);
                yield break;
            }

            string audioContent = ExtractAudioContent(www.downloadHandler.text);
            if (string.IsNullOrEmpty(audioContent))
            {
                Debug.LogError("GoogleCloud: Response did not contain any audio content.");
                yield break;
            }

            SetAudioClip(audioContent);
        }
    }

    /// <summary>Returns the <c>audioContent</c> value of a synthesize response, or null if unavailable.</summary>
    private static string ExtractAudioContent(string responseJson)
    {
        if (string.IsNullOrEmpty(responseJson))
        {
            return null;
        }

        try
        {
            TextToSpeechResponse response = JsonConvert.DeserializeObject<TextToSpeechResponse>(responseJson);
            return response?.audioContent;
        }
        catch (JsonException e)
        {
            Debug.LogError("GoogleCloud: Could not parse synthesize response: " + e.Message);
            return null;
        }
    }

    /// <summary>
    /// Decodes the base64 audio, writes it to this instance's temp file and starts loading
    /// that file as an AudioClip.
    /// </summary>
    private void SetAudioClip(string base64AudioContent)
    {
        string tempFilePath = Path.Combine(Application.temporaryCachePath, _tmpPath);

        try
        {
            byte[] audioBytes = Convert.FromBase64String(base64AudioContent);
            File.WriteAllBytes(tempFilePath, audioBytes);
        }
        catch (Exception e) when (e is ArgumentNullException || e is FormatException || e is IOException || e is UnauthorizedAccessException)
        {
            Debug.LogError("GoogleCloud: Could not store audio data: " + e.Message);
            return;
        }

        _writtenTempFilePath = tempFilePath;
        StartCoroutine(LoadWavAudio(tempFilePath));
    }

    /// <summary>Loads the WAV file at <paramref name="path"/>, assigns it to the AudioSource and plays it.</summary>
    IEnumerator LoadWavAudio(string path)
    {
        using (UnityWebRequest www = UnityWebRequestMultimedia.GetAudioClip("file:///" + path, AudioType.WAV))
        {
            yield return www.SendWebRequest();

            if (RequestFailed(www))
            {
                Debug.LogError("Fehler beim Laden des AudioClips: " + www.error);
                yield break;
            }

            AudioClip loadedClip = DownloadHandlerAudioClip.GetContent(www);
            if (loadedClip == null || _audioSource == null)
            {
                Debug.LogError("Text2Speech: Loaded audio could not be assigned (missing clip or AudioSource).");
                yield break;
            }

            // Only clips created by this component are destroyed; a clip assigned in the
            // editor is an asset and must not be passed to Destroy.
            AudioClip previousClip = _generatedClip;
            _generatedClip = loadedClip;
            _audioSource.clip = loadedClip;
            if (previousClip != null)
            {
                Destroy(previousClip);
            }

            _audioSource.Play();
        }
    }

    /// <summary>Releases the runtime-created clip and removes the temp WAV file, if any.</summary>
    void OnDestroy()
    {
        if (_generatedClip != null)
        {
            Destroy(_generatedClip);
            _generatedClip = null;
        }

        if (string.IsNullOrEmpty(_writtenTempFilePath))
        {
            return;
        }

        try
        {
            if (File.Exists(_writtenTempFilePath))
            {
                File.Delete(_writtenTempFilePath);
            }
        }
        catch (Exception e) when (e is IOException || e is UnauthorizedAccessException)
        {
            // Best effort: a leftover file in the temporary cache is harmless.
            Debug.LogWarning("Text2Speech: Could not delete temp audio file: " + e.Message);
        }
    }

    /// <summary>
    /// Decodes the base64 audio and writes it to <c>_outputPath</c>. Not called anywhere
    /// in this class; <c>_outputPath</c> is still the placeholder "path/to/audio.wav".
    /// </summary>
    private void SaveAudioFile(string base64AudioContent)
    {
        byte[] audioBytes = Convert.FromBase64String(base64AudioContent);
        File.WriteAllBytes(_outputPath, audioBytes);
        Debug.Log($"GoogleCloud: Successfully saved WAV file as {_outputPath}");
    }

    /// <summary>
    /// Returns the inspector-configured key when set, otherwise the value of the given
    /// environment variable, or null when neither provides a non-blank value.
    /// </summary>
    private static string ResolveApiKey(string configuredKey, string environmentVariable)
    {
        if (!string.IsNullOrWhiteSpace(configuredKey))
        {
            return configuredKey.Trim();
        }

        string fromEnvironment = Environment.GetEnvironmentVariable(environmentVariable);
        return string.IsNullOrWhiteSpace(fromEnvironment) ? null : fromEnvironment.Trim();
    }

    /// <summary>True when the finished request ended in any state other than success.</summary>
    private static bool RequestFailed(UnityWebRequest request)
    {
        // Covers ConnectionError, ProtocolError and DataProcessingError alike.
        return request.result != UnityWebRequest.Result.Success;
    }
}