{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Laden der Rohdaten"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"# Laden der 'kirp' Liste aus der Pickle-Datei\n",
"with open('rick.pickle', 'rb') as f:\n",
" data_frame = pickle.load(f)"
]
},
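{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the loaded data (a sketch; it assumes `data_frame` is a Pandas DataFrame with the columns `cancer_type` and `genome_frequencies` that are used further below):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the loaded DataFrame before any processing.\n",
"# Assumes the columns 'cancer_type' and 'genome_frequencies' referenced later in this notebook.\n",
"print(type(data_frame))\n",
"print(data_frame.shape)\n",
"print(data_frame.columns.tolist())\n",
"print(data_frame['cancer_type'].value_counts())\n",
"print(len(data_frame['genome_frequencies'].iloc[0]))  # length of one raw frequency vector"
]
},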
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Aktiviere Cuda Support"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CUDA is available on your system.\n"
]
}
],
"source": [
"import torch\n",
"device = \"cpu\"\n",
"if torch.cuda.is_available():\n",
" print(\"CUDA is available on your system.\")\n",
" device = \"cuda\"\n",
"else:\n",
" print(\"CUDA is not available on your system.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PCA Klasse zu Reduktion der Dimensionen"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from torch.utils.data import Dataset\n",
"import torch\n",
"import pandas as pd\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.model_selection import train_test_split\n",
"from typing import List, Tuple, Dict\n",
"\n",
"\n",
"class GenomeDataset(Dataset):\n",
" \"\"\"\n",
" Eine benutzerdefinierte Dataset-Klasse, die für die Handhabung von Genomdaten konzipiert ist.\n",
" Diese Klasse wendet eine Principal Component Analysis (PCA) auf die Frequenzen der Genome an\n",
" und teilt den Datensatz in Trainings- und Validierungsteile auf.\n",
"\n",
" Attributes:\n",
" dataframe (pd.DataFrame): Ein Pandas DataFrame, der die initialen Daten enthält.\n",
" train_df (pd.DataFrame): Ein DataFrame, der den Trainingsdatensatz nach der Anwendung von PCA und der Aufteilung enthält.\n",
" val_df (pd.DataFrame): Ein DataFrame, der den Validierungsdatensatz nach der Anwendung von PCA und der Aufteilung enthält.\n",
"\n",
" Methods:\n",
" __init__(self, dataframe, n_pca_components=1034, train_size=0.8, split_random_state=42):\n",
" Konstruktor für die GenomeDataset Klasse.\n",
" _do_PCA(self, frequencies, n_components=1034):\n",
" Wendet PCA auf die gegebenen Frequenzen an.\n",
" _split_dataset(self, train_size=0.8, random_state=42):\n",
" Teilt den DataFrame in Trainings- und Validierungsdatensätze auf.\n",
" __getitem__(self, index):\n",
" Gibt ein Tupel aus transformierten Frequenzen und dem zugehörigen Krebstyp für einen gegebenen Index zurück.\n",
" __len__(self):\n",
" Gibt die Gesamtlänge der kombinierten Trainings- und Validierungsdatensätze zurück.\n",
" \"\"\"\n",
"\n",
" def __init__(self, dataframe: pd.DataFrame, n_pca_components: int = 1034, train_size: float = 0.8, split_random_state: int = 42):\n",
" \"\"\"\n",
" Konstruktor für die GenomeDataset Klasse.\n",
"\n",
" Parameters:\n",
" dataframe (pd.DataFrame): Der DataFrame, der die Genome Frequenzen und Krebsarten enthält.\n",
" n_pca_components (int): Die Anzahl der PCA-Komponenten, auf die reduziert werden soll. Standardwert ist 1034.\n",
" train_size (float): Der Anteil der Daten, der als Trainingsdaten verwendet werden soll. Standardwert ist 0.8.\n",
" split_random_state (int): Der Zufalls-Saatwert, der für die Aufteilung des Datensatzes verwendet wird. Standardwert ist 42.\n",
" \"\"\"\n",
" self.dataframe = dataframe\n",
"\n",
" # Umwandlung der Krebsarten in numerische Werte\n",
" self.label_encoder = LabelEncoder()\n",
" self.dataframe['encoded_cancer_type'] = self.label_encoder.fit_transform(dataframe['cancer_type'])\n",
"\n",
" # Anwenden der PCA auf die Frequenzen\n",
" self.dataframe['pca_frequencies'] = self._do_PCA(self.dataframe['genome_frequencies'].tolist(), n_pca_components)\n",
"\n",
" # Teilen des DataFrame in Trainings- und Validierungsdatensatz\n",
" self._split_dataset(train_size=train_size, random_state=split_random_state)\n",
"\n",
" def transform_datapoint(self, datapoint: List[float]) -> List[float]:\n",
" \"\"\"\n",
" Transformiert einen einzelnen Datenpunkt durch Standardisierung und Anwendung der PCA.\n",
"\n",
" Diese Methode nimmt einen rohen Datenpunkt (eine Liste von Frequenzen), standardisiert ihn mit dem \n",
" zuvor angepassten Scaler und wendet dann die PCA-Transformation an, um ihn in den reduzierten \n",
" Feature-Raum zu überführen, der für das Training des Modells verwendet wurde.\n",
"\n",
" Parameters:\n",
" datapoint (List[float]): Ein roher Datenpunkt, bestehend aus einer Liste von Frequenzen.\n",
"\n",
" Returns:\n",
" List[float]: Der transformierte Datenpunkt, nach Anwendung der Standardisierung und der PCA.\n",
" \"\"\"\n",
" # Standardisierung des Datenpunkts\n",
" scaled_data_point = self.scaler.transform([datapoint])\n",
"\n",
" # PCA-Transformation des standardisierten Datenpunkts\n",
" pca_transformed_point = self.pca.transform(scaled_data_point)\n",
"\n",
" return pca_transformed_point.tolist()\n",
"\n",
" def _do_PCA(self, frequencies: List[List[float]], n_components: int = 1034) -> List[List[float]]:\n",
" \"\"\"\n",
" Wendet PCA auf die gegebenen Frequenzen an.\n",
"\n",
" Parameters:\n",
" frequencies (List[List[float]]): Die Liste der Frequenzen, auf die die PCA angewendet werden soll.\n",
" n_components (int): Die Anzahl der Komponenten für die PCA. Standardwert ist 1034.\n",
"\n",
" Returns:\n",
" List[List[float]]: Eine Liste von Listen, die die transformierten Frequenzen nach der PCA darstellt.\n",
" \"\"\"\n",
"\n",
" # Standardisieren der Frequenzen\n",
" self.scaler = StandardScaler()\n",
" scaled_frequencies = self.scaler.fit_transform(frequencies)\n",
"\n",
" # PCA-Instanz erstellen und auf die gewünschte Anzahl von Komponenten reduzieren\n",
" self.pca = PCA(n_components=n_components)\n",
"\n",
" # PCA auf die Frequenzen anwenden\n",
" pca_result = self.pca.fit_transform(scaled_frequencies)\n",
"\n",
" return pca_result.tolist()\n",
"\n",
" def _split_dataset(self, train_size: float = 0.8, random_state: int = 42):\n",
" \"\"\"\n",
" Teilt den DataFrame in Trainings- und Validierungsdatensätze auf.\n",
"\n",
" Parameters:\n",
" train_size (float): Der Anteil der Daten, der als Trainingsdaten verwendet werden soll.\n",
" random_state (int): Der Zufalls-Saatwert, der für die Aufteilung des Datensatzes verwendet wird.\n",
" \"\"\"\n",
"\n",
" class SplittedDataset(Dataset):\n",
" def __init__(self, dataframe):\n",
" self.dataframe = dataframe\n",
"\n",
" # Umwandlung der Genome Frequenzen in Tensoren\n",
" self.genome_frequencies = torch.tensor(dataframe['pca_frequencies'].tolist(), dtype=torch.float32)\n",
"\n",
" # Umwandlung der Krebsarten in numerische Werte\n",
" self.label_encoder = LabelEncoder()\n",
" self.cancer_types = torch.tensor(dataframe['encoded_cancer_type'].tolist(), dtype=torch.long)\n",
"\n",
" def __getitem__(self, index):\n",
" # Rückgabe eines Tupels aus Genome Frequenzen und dem entsprechenden Krebstyp\n",
" return self.genome_frequencies[index], self.cancer_types[index]\n",
"\n",
" def __len__(self):\n",
" return len(self.dataframe)\n",
"\n",
" # Teilen des DataFrame in Trainings- und Validierungsdatensatz\n",
" train_df, val_df = train_test_split(self.dataframe, train_size=train_size) #, random_state=random_state)\n",
" self.train_df = SplittedDataset(train_df)\n",
" self.val_df = SplittedDataset(val_df)\n",
"\n",
"\n",
" def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:\n",
" \"\"\"\n",
" Gibt ein Tupel aus transformierten Frequenzen und dem entsprechenden Krebstyp für einen gegebenen Index zurück.\n",
"\n",
" Parameters:\n",
" index (int): Der Index des zu abrufenden Datenelements.\n",
"\n",
" Returns:\n",
" Tuple[torch.Tensor, int]: Ein Tupel, bestehend aus einem Tensor der transformierten Frequenzen und dem zugehörigen Krebstyp.\n",
" \"\"\"\n",
"\n",
" print(self.train_df.shape)\n",
" print(self.val_df.shape)\n",
" \n",
" if index < len(self.train_df):\n",
" row = self.train_df.iloc[index]\n",
" else:\n",
" row = self.val_df.iloc[len(self.train_df) - index]\n",
"\n",
" pca_frequencies_tensor = torch.tensor(row['pca_frequencies'], dtype=torch.float32)\n",
" cancer_type = row['encoded_cancer_type']\n",
"\n",
" return pca_frequencies_tensor, cancer_type\n",
"\n",
" def __len__(self) -> int:\n",
" \"\"\"\n",
" Gibt die Gesamtlänge der kombinierten Trainings- und Validierungsdatensätze zurück.\n",
"\n",
" Returns:\n",
" int: Die Länge der kombinierten Datensätze.\n",
" \"\"\"\n",
" \n",
" return len(self.train_df) + len(self.val_df)\n"
]
},
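{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch for `GenomeDataset`, mirroring what `ExperimentationalExperiments.run` does further below. The component count of 64 and the batch size of 32 are illustrative assumptions, not values from the experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from torch.utils.data import DataLoader\n",
"\n",
"# Sketch: build the dataset, wrap the train/validation splits in DataLoaders,\n",
"# and inspect one batch. Assumes 'data_frame' from the first cell.\n",
"example_dataset = GenomeDataset(data_frame, n_pca_components=64, train_size=0.8)\n",
"\n",
"example_train_loader = DataLoader(example_dataset.train_df, batch_size=32, shuffle=True)\n",
"example_valid_loader = DataLoader(example_dataset.val_df, batch_size=32, shuffle=False)\n",
"\n",
"inputs, labels = next(iter(example_train_loader))\n",
"print(inputs.shape)   # (batch_size, n_pca_components)\n",
"print(labels.shape)   # (batch_size,)"
]
},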
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Definition des neuronalen Netzes"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.nn.functional as F\n",
"from sklearn.metrics import confusion_matrix\n",
"\n",
"class CancerClassifierNN(nn.Module):\n",
" \"\"\"\n",
" Eine benutzerdefinierte neuronale Netzwerkklassifikator-Klasse für die Krebsklassifikation.\n",
"\n",
" Diese Klasse definiert ein mehrschichtiges Perzeptron (MLP), das für die Klassifizierung von Krebsarten\n",
" anhand genetischer Frequenzdaten verwendet wird.\n",
"\n",
" Attributes:\n",
" fc1 (nn.Linear): Die erste lineare Schicht des Netzwerks.\n",
" fc2 (nn.Linear): Die zweite lineare Schicht des Netzwerks.\n",
" fc3 (nn.Linear): Die dritte lineare Schicht des Netzwerks.\n",
" fc4 (nn.Linear): Die Ausgabeschicht des Netzwerks.\n",
" dropout (nn.Dropout): Ein Dropout-Layer zur Vermeidung von Overfitting.\n",
"\n",
" Methods:\n",
" __init__(self, input_size: int, num_classes: int):\n",
" Konstruktor für die CancerClassifierNN Klasse.\n",
" forward(self, x: torch.Tensor) -> torch.Tensor:\n",
" Definiert den Vorwärtsdurchlauf des Netzwerks.\n",
" \"\"\"\n",
"\n",
" def __init__(self, input_size: int, num_classes: int):\n",
" \"\"\"\n",
" Konstruktor für die CancerClassifierNN Klasse.\n",
"\n",
" Parameters:\n",
" input_size (int): Die Größe des Input-Features.\n",
" num_classes (int): Die Anzahl der Zielklassen.\n",
" \"\"\"\n",
" super(CancerClassifierNN, self).__init__()\n",
" # Definieren der Schichten\n",
" self.fc1 = nn.Linear(input_size, input_size) # Eingabeschicht\n",
" self.fc2 = nn.Linear(input_size, input_size//2) # Versteckte Schicht\n",
" self.fc3 = nn.Linear(input_size//2, input_size//4) # Weitere versteckte Schicht\n",
" self.fc4 = nn.Linear(input_size//4, num_classes) # Ausgabeschicht\n",
" self.dropout = nn.Dropout(p=0.5) # Dropout\n",
"\n",
" def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
" \"\"\"\n",
" Definiert den Vorwärtsdurchlauf des Netzwerks.\n",
"\n",
" Parameters:\n",
" x (torch.Tensor): Der Input-Tensor für das Netzwerk.\n",
"\n",
" Returns:\n",
" torch.Tensor: Der Output-Tensor nach dem Durchlauf durch das Netzwerk.\n",
" \"\"\"\n",
" x = F.relu(self.fc1(x))\n",
" x = self.dropout(x)\n",
" x = F.relu(self.fc2(x))\n",
" x = self.dropout(x)\n",
" x = F.relu(self.fc3(x))\n",
" x = self.dropout(x)\n",
" # Rohe Logits zurückgeben: nn.CrossEntropyLoss wendet log_softmax bereits intern an,\n",
" # ein zusätzliches softmax an dieser Stelle würde das Training verschlechtern\n",
" x = self.fc4(x)\n",
" return x"
]
},
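{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small shape check for `CancerClassifierNN` with a random dummy batch. The input size of 64 and the batch size of 8 are assumptions made only for this illustration; the actual experiments use the PCA component counts configured below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: verify the tensor shapes of a forward pass (not part of the experiments).\n",
"dummy_model = CancerClassifierNN(input_size=64, num_classes=3).to(device)\n",
"dummy_batch = torch.randn(8, 64, device=device)  # 8 samples with 64 features each\n",
"\n",
"dummy_model.eval()\n",
"with torch.no_grad():\n",
"    dummy_output = dummy_model(dummy_batch)\n",
"\n",
"print(dummy_output.shape)  # expected: torch.Size([8, 3])"
]
},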
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from torch.utils.data import DataLoader\n",
"import torch.optim as optim\n",
"from IPython.display import clear_output\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import os\n",
"import pickle\n",
"\n",
"class ExperimentationalExperiments():\n",
" \"\"\"\n",
" Diese Klasse dient zur Durchführung und Verwaltung von Experimenten im Rahmen\n",
" des maschinellen Lernens, insbesondere für die Krebsklassifikation.\n",
"\n",
" Attribute:\n",
" results : Dict\n",
" Speichert die Ergebnisse der durchgeführten Experimente.\n",
" \"\"\"\n",
"\n",
" def __init__(self) -> None:\n",
" \"\"\" Konstruktor der Klasse. Initialisiert 'results' als None. \"\"\"\n",
" self.results = None\n",
"\n",
" def run_single_experiment(self, train_loader: DataLoader, valid_loader: DataLoader, n_pca_components: int, n_epochs: int = 200, learning_rate: int = 0.0005, verbose: bool = True, experiment_num: int = None) -> Tuple:\n",
" \"\"\"\n",
" Führt ein einzelnes Experiment mit dem spezifizierten DataLoader, PCA-Komponenten und weiteren Parametern durch.\n",
"\n",
" Parameter:\n",
" train_loader : DataLoader\n",
" Der DataLoader für den Trainingsdatensatz.\n",
" valid_loader : DataLoader\n",
" Der DataLoader für den Validierungsdatensatz.\n",
" n_pca_components : int\n",
" Anzahl der PCA-Komponenten, die im Modell verwendet werden.\n",
" n_epochs : int, optional\n",
" Anzahl der Epochen für das Training (Standardwert ist 200).\n",
" learning_rate : float, optional\n",
" Lernrate für den Optimierer (Standardwert ist 0.0005).\n",
" verbose : bool, optional\n",
" Gibt an, ob der Trainingsfortschritt angezeigt werden soll (Standardwert ist True).\n",
" experiment_num : int, optional\n",
" Nummer des Experiments.\n",
"\n",
" Rückgabewerte:\n",
" Tuple\n",
" Ein Tupel bestehend aus Listen der Trainings- und Validierungsverluste sowie der Genauigkeiten.\n",
" \"\"\"\n",
" if not isinstance(n_pca_components, int):\n",
" raise TypeError(\"n_pca_components must be an integer!\")\n",
" \n",
" plt.ioff()\n",
"\n",
" model = CancerClassifierNN(input_size=n_pca_components, num_classes=3)\n",
" model.to(device=device)\n",
"\n",
" # Verlustfunktion\n",
" criterion = nn.CrossEntropyLoss()\n",
" # Optimierer\n",
" optimizer = optim.Adam(model.parameters(), lr=learning_rate)\n",
"\n",
" # Listen, um Verluste zu speichern\n",
" train_losses = []\n",
" valid_losses = []\n",
" train_accuracies = []\n",
" valid_accuracies = []\n",
"\n",
" for epoch in range(n_epochs):\n",
" model.train()\n",
" train_loss = 0.0\n",
" correct_predictions = 0\n",
" total_predictions = 0\n",
"\n",
" for i, (inputs, labels) in enumerate(train_loader):\n",
" inputs, labels = inputs.to(device), labels.to(device)\n",
" optimizer.zero_grad()\n",
" outputs = model(inputs)\n",
" loss = criterion(outputs, labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
" train_loss += loss.item()\n",
"\n",
" # Berechnen der Genauigkeit\n",
" _, predicted = torch.max(outputs, 1)\n",
" correct_predictions += (predicted == labels).sum().item()\n",
" total_predictions += labels.size(0)\n",
"\n",
" # Durchschnittlicher Trainingsverlust und Genauigkeit\n",
" train_loss /= len(train_loader)\n",
" train_accuracy = correct_predictions / total_predictions\n",
" train_losses.append(train_loss)\n",
" train_accuracies.append(train_accuracy)\n",
"\n",
" # Validierungsverlust und Genauigkeit\n",
" model.eval()\n",
" valid_loss = 0.0\n",
" correct_predictions = 0\n",
" total_predictions = 0\n",
"\n",
" with torch.no_grad():\n",
" for inputs, labels in valid_loader:\n",
" inputs, labels = inputs.to(device), labels.to(device)\n",
" outputs = model(inputs)\n",
" loss = criterion(outputs, labels)\n",
" valid_loss += loss.item()\n",
"\n",
" # Berechnen der Genauigkeit\n",
" _, predicted = torch.max(outputs, 1)\n",
" correct_predictions += (predicted == labels).sum().item()\n",
" total_predictions += labels.size(0)\n",
"\n",
" # Durchschnittlicher Validierungsverlust und Genauigkeit\n",
" valid_loss /= len(valid_loader)\n",
" valid_accuracy = correct_predictions / total_predictions\n",
" valid_losses.append(valid_loss)\n",
" valid_accuracies.append(valid_accuracy)\n",
"\n",
" # Fortschritt anzeigen, falls angegeben\n",
" if verbose:\n",
" clear_output(wait=True)\n",
" print(f'Experiment #{experiment_num} mit {n_pca_components} PCA components: Epoch [{epoch+1}/{n_epochs}], Trainingsverlust: {train_loss:.4f}, Trainingsgenauigkeit: {train_accuracies[-1]:.4f}, Validierungsverlust: {valid_loss:.4f}, Validierungsgenauigkeit: {valid_accuracies[-1]:.4f}')\n",
"\n",
" # Aktualisieren des Graphen\n",
" clear_output(wait=True)\n",
" fig, ax1 = plt.subplots()\n",
"\n",
" # Zeichnen der Verlustkurven\n",
" ax1.plot(train_losses, label='Trainingsverlust', color='r')\n",
" ax1.plot(valid_losses, label='Validierungsverlust', color='b')\n",
" ax1.set_xlabel('Epochen')\n",
" ax1.set_ylabel('Verlust', color='g')\n",
" ax1.tick_params(axis='y', labelcolor='g')\n",
"\n",
" # Zweite y-Achse für die Genauigkeit\n",
" ax2 = ax1.twinx()\n",
" ax2.plot(train_accuracies, label='Trainingsgenauigkeit', color='r', linestyle='dashed')\n",
" ax2.plot(valid_accuracies, label='Validierungsgenauigkeit', color='b', linestyle='dashed')\n",
" ax2.set_ylabel('Genauigkeit', color='g')\n",
" ax2.tick_params(axis='y', labelcolor='g')\n",
"\n",
" # Titel und Legende\n",
" plt.title(f'Experiment #{experiment_num}: Trainings- und Validierungsverlust und -genauigkeit über die Zeit mit \\n{n_pca_components}-Hauptkomponenten, Lernrate: {learning_rate}')\n",
" fig.tight_layout()\n",
"\n",
" # Legende außerhalb des Graphen\n",
" ax1.legend(loc='upper left', bbox_to_anchor=(1.15, 1))\n",
" ax2.legend(loc='upper left', bbox_to_anchor=(1.15, 0.85))\n",
"\n",
" # Plot speichern\n",
" if experiment_num is None or experiment_num > 0:\n",
" name = str(experiment_num) + \".png\" if experiment_num is not None else \"single_experiment.png\"\n",
" if not os.path.exists(\"Experiments\"):\n",
" os.makedirs(\"Experiments\")\n",
" if not os.path.exists(f\"Experiments/{str(n_pca_components)}\"):\n",
" os.makedirs(f\"Experiments/{str(n_pca_components)}\")\n",
" plt.savefig(f\"Experiments/{str(n_pca_components)}/{name}\", bbox_inches='tight')\n",
"\n",
" plt.ion()\n",
"\n",
" return model, (train_losses, valid_losses, train_accuracies, valid_accuracies)\n",
"\n",
" def run_single_pca_experiment(self, train_loader: DataLoader, valid_loader: DataLoader, n_pca_components: int, n_experiments: int, n_epochs: int = 200, learning_rate: int = 0.0005, verbose: bool = True) -> List:\n",
2024-01-05 15:56:33 +01:00
" \"\"\"\n",
" Führt eine Serie von Experimenten mit verschiedenen Konfigurationen für die PCA-Komponenten durch.\n",
"\n",
" Parameter:\n",
" train_loader : DataLoader\n",
" Der DataLoader für den Trainingsdatensatz.\n",
" valid_loader : DataLoader\n",
" Der DataLoader für den Validierungsdatensatz.\n",
" n_pca_components : int\n",
" Anzahl der PCA-Komponenten, die im Modell verwendet werden.\n",
" n_experiments : int\n",
" Anzahl der durchzuführenden Experimente.\n",
" n_epochs : int, optional\n",
" Anzahl der Epochen für das Training (Standardwert ist 200).\n",
" learning_rate : float, optional\n",
" Lernrate für den Optimierer (Standardwert ist 0.0005).\n",
" verbose : bool, optional\n",
" Gibt an, ob der Trainingsfortschritt angezeigt werden soll (Standardwert ist True).\n",
"\n",
" Rückgabewerte:\n",
" List\n",
" Eine Liste von Ergebnissen der einzelnen Experimente.\n",
" \"\"\"\n",
" if not isinstance(n_pca_components, int):\n",
" raise TypeError(\"n_pca_components must be an integer!\")\n",
"\n",
" results = []\n",
"\n",
" for n in range(n_experiments):\n",
" _, res = self.run_single_experiment(train_loader, valid_loader, n_pca_components, n_epochs=n_epochs, learning_rate=learning_rate, verbose=verbose, experiment_num=n+1)\n",
" results.append(res)\n",
"\n",
" return results\n",
" \n",
"\n",
" def run(self, n_pca_components: List[int], n_experiments: int, n_epochs: int = 200, learning_rate: int = 0.0005, batch_size: int = 64, verbose: bool = True) -> Dict:\n",
2024-01-05 15:56:33 +01:00
" \"\"\"\n",
" Hauptmethode zum Ausführen von Experimenten mit verschiedenen Anzahlen von PCA-Komponenten.\n",
"\n",
" Parameter:\n",
" n_pca_components : List[int]\n",
" Eine Liste von Anzahlen der PCA-Komponenten, die in den Experimenten verwendet werden sollen.\n",
" n_experiments : int\n",
" Anzahl der durchzuführenden Experimente pro PCA-Komponentenanzahl.\n",
" n_epochs : int, optional\n",
" Anzahl der Epochen für das Training (Standardwert ist 200).\n",
" learning_rate : float, optional\n",
" Lernrate für den Optimierer (Standardwert ist 0.0005).\n",
" batch_size : int, optional\n",
" Batch-Größe für das Laden der Daten (Standardwert ist 64).\n",
" verbose : bool, optional\n",
" Gibt an, ob der Trainingsfortschritt angezeigt werden soll (Standardwert ist True).\n",
"\n",
" Rückgabewerte:\n",
" Dict\n",
" Ein Wörterbuch, das die Ergebnisse der Experimente für jede Anzahl von PCA-Komponenten enthält.\n",
" \"\"\"\n",
" if not isinstance(n_pca_components, list):\n",
" raise TypeError(\"n_pca_components must be a list of integers!\")\n",
"\n",
" plt.ioff()\n",
" self.n_pca_components = n_pca_components\n",
"\n",
" results = {}\n",
"\n",
" for n_pca_comps in n_pca_components:\n",
" genome_dataset = GenomeDataset(data_frame, n_pca_components=n_pca_comps)\n",
" train_dataset = genome_dataset.train_df\n",
" valid_dataset = genome_dataset.val_df\n",
"\n",
" train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n",
" valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)\n",
"\n",
" res = self.run_single_pca_experiment(train_loader, valid_loader, n_pca_comps, n_experiments, n_epochs=n_epochs, learning_rate=learning_rate, verbose=verbose)\n",
" results[str(n_pca_comps)] = res\n",
"\n",
" self.plot_and_save_results(res, n_pca_comps)\n",
"\n",
" self.results = results\n",
"\n",
" # Speichern der Daten in einer lokalen Datei\n",
" if len(n_pca_components) > 1:\n",
" with open('Experiments/results.pickle', 'wb') as f:\n",
" pickle.dump(self.results, f)\n",
" else:\n",
" with open(f'Experiments/{str(n_pca_components[0])}/results_{str(n_pca_components[0])}.pickle', 'wb') as f:\n",
" pickle.dump(self.results, f)\n",
"\n",
" plt.ion()\n",
"\n",
" return results\n",
"\n",
" def calculate_confusion_matrix(self, n_pca_components: int, n_epochs: int = 500, batch_size: int = 64, learning_rate: float = 0.0005, verbose: bool = True):\n",
" \"\"\"\n",
" Trainiert ein Modell und erstellt eine Konfusionsmatrix\n",
" \"\"\"\n",
" all_preds = []\n",
" all_targets = []\n",
"\n",
" genome_dataset = GenomeDataset(data_frame, n_pca_components=n_pca_components)\n",
" train_dataset = genome_dataset.train_df\n",
" valid_dataset = genome_dataset.val_df\n",
"\n",
" train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n",
" valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)\n",
"\n",
" # if dataset == \"train\":\n",
" # dataset_loader = train_loader\n",
" # elif dataset == \"valid\":\n",
" # dataset_loader = valid_loader\n",
" # else:\n",
" # raise Warning(\"Invalid Dataset Type!\")\n",
" # return None\n",
"\n",
" model, _ = self.run_single_experiment(train_loader, valid_loader, n_pca_components, n_epochs, learning_rate, verbose, 0)\n",
" \n",
" def get_predictions_and_targets(loader, model, device):\n",
" all_preds = []\n",
" all_targets = []\n",
" model.eval()\n",
"\n",
" with torch.no_grad():\n",
" for data, target in loader:\n",
" data, target = data.to(device), target.to(device)\n",
" outputs = model(data)\n",
" _, preds = torch.max(outputs, 1)\n",
" all_preds.extend(preds.to('cpu').numpy())\n",
" all_targets.extend(target.to('cpu').numpy())\n",
"\n",
" return all_preds, all_targets\n",
"\n",
" # Sammeln der Vorhersagen und wahren Labels von Trainings- und Validierungsdaten\n",
" train_preds, train_targets = get_predictions_and_targets(train_loader, model, device)\n",
" valid_preds, valid_targets = get_predictions_and_targets(valid_loader, model, device)\n",
"\n",
" # Kombinieren der Daten\n",
" all_preds = train_preds + valid_preds\n",
" all_targets = train_targets + valid_targets\n",
"\n",
" # Berechnen der Konfusionsmatrix\n",
" conf_matrix = confusion_matrix(all_targets, all_preds)\n",
"\n",
" return conf_matrix\n",
"\n",
" def plot_and_save_results(self, results: List[Tuple], n_pca_components: int) -> None:\n",
" \"\"\"\n",
" Erstellt und speichert Plots für die Ergebnisse der Experimente.\n",
"\n",
" Parameter:\n",
" results : List[Tuple]\n",
" Eine Liste von Tupeln mit den Ergebnissen der Experimente.\n",
" n_pca_components : int\n",
" Anzahl der PCA-Komponenten, für die die Ergebnisse geplottet werden sollen.\n",
"\n",
" Keine Rückgabewerte, da die Methode Plots speichert.\n",
" \"\"\"\n",
" \n",
" # Mittelwerte und Standardabweichungen berechnen\n",
" train_losses, valid_losses, train_accuracies, valid_accuracies = zip(*results)\n",
"\n",
" train_losses = np.array(train_losses)\n",
" valid_losses = np.array(valid_losses)\n",
" train_accuracies = np.array(train_accuracies)\n",
" valid_accuracies = np.array(valid_accuracies)\n",
"\n",
" avg_train_losses = np.mean(train_losses, axis=0)\n",
" avg_valid_losses = np.mean(valid_losses, axis=0)\n",
" avg_train_acc = np.mean(train_accuracies, axis=0)\n",
" avg_valid_acc = np.mean(valid_accuracies, axis=0)\n",
"\n",
" std_train_losses = np.std(train_losses, axis=0)\n",
" std_valid_losses = np.std(valid_losses, axis=0)\n",
" std_train_acc = np.std(train_accuracies, axis=0)\n",
" std_valid_acc = np.std(valid_accuracies, axis=0)\n",
"\n",
" # Erstellen von Plots\n",
" epochs = range(1, len(avg_train_losses) + 1)\n",
"\n",
" # Plot für Verluste\n",
" plt.clf()\n",
" plt.plot(epochs, avg_train_losses, label='Mittlerer Trainingsverlust', color='r')\n",
" plt.fill_between(epochs, np.subtract(avg_train_losses, std_train_losses), np.add(avg_train_losses, std_train_losses), color='r', alpha=0.2)\n",
" plt.plot(epochs, avg_valid_losses, label='Mittlerer Validierungsverlust', color='b')\n",
" plt.fill_between(epochs, np.subtract(avg_valid_losses, std_valid_losses), np.add(avg_valid_losses, std_valid_losses), color='b', alpha=0.2)\n",
" plt.title(f'Mittelwert und Standardabweichung der Verluste für {n_pca_components} PCA-Komponenten')\n",
" plt.xlabel('Experiment Nummer')\n",
" plt.ylabel('Verlust')\n",
" plt.legend()\n",
" plt.savefig(f\"Experiments/{n_pca_components}/average_losses.png\", bbox_inches='tight')\n",
" plt.clf()\n",
"\n",
" # Plot für Genauigkeiten\n",
" plt.plot(epochs, avg_train_acc, label='Mittlere Trainingsgenauigkeit', color='r')\n",
" plt.fill_between(epochs, np.subtract(avg_train_acc, std_train_acc), np.add(avg_train_acc, std_train_acc), color='r', alpha=0.2)\n",
" plt.plot(epochs, avg_valid_acc, label='Mittlere Validierungsgenauigkeit', color='b')\n",
" plt.fill_between(epochs, np.subtract(avg_valid_acc, std_valid_acc), np.add(avg_valid_acc, std_valid_acc), color='b', alpha=0.2)\n",
" plt.title(f'Mittelwert und Standardabweichung der Genauigkeiten für {n_pca_components} PCA-Komponenten')\n",
" plt.xlabel('Experiment Nummer')\n",
" plt.ylabel('Genauigkeit')\n",
" plt.legend()\n",
" plt.savefig(f\"Experiments/{n_pca_components}/average_accuracies.png\", bbox_inches='tight')\n",
" plt.clf()\n",
"\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Starten der einzelnen Experimente, da in einer Schleife RAM Probleme auftreten"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"e1 = ExperimentationalExperiments()\n",
"results = e1.run([1024], 10, n_epochs=500)\n",
"del e1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"e2 = ExperimentationalExperiments()\n",
"results = e2.run([512], 10, n_epochs=500)\n",
"del e2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"e3 = ExperimentationalExperiments()\n",
"results = e3.run([256], 10, n_epochs=500)\n",
"del e3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"e4 = ExperimentationalExperiments()\n",
"results = e4.run([128], 10, n_epochs=500)\n",
"del e4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"e5 = ExperimentationalExperiments()\n",
"results = e5.run([64], 10, n_epochs=500)\n",
"del e5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"e6 = ExperimentationalExperiments()\n",
"results = e6.run([32], 10, n_epochs=500)\n",
"del e6"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"e7 = ExperimentationalExperiments()\n",
"results = e7.run([16], 10, n_epochs=500)\n",
"del e7"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"e8 = ExperimentationalExperiments()\n",
"results = e8.run([1034], 10, n_epochs=500)\n",
"del e8"
]
},
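{
"cell_type": "markdown",
"metadata": {},
"source": [
"One possible way to release memory between the individual runs above (a sketch of an assumption, not something used in the original experiments): explicitly trigger garbage collection and clear the CUDA cache after deleting an `ExperimentationalExperiments` instance."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gc\n",
"\n",
"# Sketch: free RAM/VRAM before starting the next configuration.\n",
"gc.collect()\n",
"if torch.cuda.is_available():\n",
"    torch.cuda.empty_cache()"
]
},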
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Erstellung der Konfusionsmatrix für 512 Hauptkomponenten"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Experiment #0 mit 512 PCA components: Epoch [500/500], Trainingsverlust: 0.5583, Trainingsgenauigkeit: 0.9927, Validierungsverlust: 0.5903, Validierungsgenauigkeit: 0.9517\n"
]
}
],
"source": [
"e = ExperimentationalExperiments()\n",
"conf_matrix = e.calculate_confusion_matrix(512, n_epochs=500)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA5cAAAHWCAYAAADjDn0FAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8WgzjOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd3hN2dfHvze9JyI9IpFE70KQIEqILnonusEYM/rPGDF6iYmXwWCU0ctgzKhhRO9EDYIgCFGCRPq96/1jz7n3nluSm67sz/OcJzn77L73Ofess9ZeW0JEBA6Hw+FwOBwOh8PhcPKBXnFXgMPhcDgcDofD4XA4nz9cuORwOBwOh8PhcDgcTr7hwiWHw+FwOBwOh8PhcPINFy45HA6Hw+FwOBwOh5NvuHDJ4XA4HA6Hw+FwOJx8w4VLDofD4XA4HA6Hw+HkGy5ccjgcDofD4XA4HA4n33DhksPhcDgcDofD4XA4+YYLlxwOh8PhcDgcDofDyTdflXAZGhoKiURS3NXg5BOJRILQ0NA8pfXw8EBISEiB1udLorjukXXr1kEikeDRo0fysMaNG6Nx48Y5po2MjIREIkFkZKQ8LCQkBB4eHgVeT46Cr7WPHz16BIlEgnXr1hV3VT4J8vPMENK+fv26QOqSn98GTXXauXOn2rMFUDxzdu7cma+yCoP8PEuLm4IYv/ygqe84HE7uyZVwKdx42o5z584VVj2/KmbPno09e/bkKS0RwdbWFr/99hsA4OrVq1ofljKZDPPnz0eZMmVgYmKCatWqYcuWLXkqN6e5IRxf48vol0RmZibs7OzQoEEDrXGICG5ubqhVq1YR1ozzpbFs2TIuwHHyxZkzZxAaGop3797pnKZTp07YsGEDLC0t4evriw0bNqBixYqFV8nPDEGwzulQFcjzQl7Gr6DhzyEOJ/cY5CXRzz//jDJlyqiFe3t757tChcmPP/6ISZMmFXc1cmT27Nno0qULgoODc502JiYGiYmJqFevHgDg7NmzcHR01CjUTZkyBXPnzsWQIUNQp04d/PXXX+jVqxckEgl69OiRq3IbNWqEDRs2iMIGDx4MX19fDB06VB5mYWGR6zapkpqaCgODPE1d3L17F3p6X5XCvkAxNDRE165d8dtvv+Hx48dwd3dXi3PixAk8ffoU33//fb7KOnz4cJ7Trlq1CjKZLF/lc4qXZcuWwc7OjlsafOJ8Sr+rqr8NZ86cwfTp0xESEgIbGxud8qhWrRqqVasGAChdujT69OlTGFUtUvLzLFWlYsWKar/1AsnJyRgzZgxMTU1Rrly5XOddEOOXH/r27YsePXrA2NhYHsafQxxO7snTG3qrVq1Qu3btgq5LofHx40eYm5vDwMAgz0LJ58KFCxdgYWGBKlWqAGDCZd26ddXiPXv2DGFhYRg5ciSWLl0KgAmDAQEBGD9+PLp27Qp9fX2dy/X09ISnp6cobPjw4fD09Mz2xzkrKwsymQxGRkY6l2ViYqJzXFWUfzQ4eaN3795YsWIFtmzZovGlcvPmzdDT08v1BwpVcjMnVDE0NMxX2coQEdLS0mBqalpgeX7OCM9TDgfAJ/W7mp/fhk+RgrrX8vMsVcXR0VHrb3qfPn2Qnp6OzZs3w8XFJdd5F/f46evr5+q9h8PhaKZQVDjTpk2Dnp4ejh49KgofOnQojIyMcO3aNQAK84pt27bhf//7H5ycnGBubo727dsjLi5OLd/z58+jZcuWsLa2hpmZGQICAnD69GlRHGG9xO3bt9GrVy+UKFFCbsKnaW2IRCLBqFGjsGPHDlSqVAmmpqaoX78+bty4AQD47bff4O3tDRMTEzRu3FijeWlu6nX//n35Vzhra2sMGDAAKSkpovp8/PgR69evl5uX5PTFLDk5Ga9fv8br169x6tQpVK1aFYmJiXj9+jXOnj2LSpUq4fXr10hMTJSn+euvv5CZmYkRI0aIyv7mm2/w9OlTnD17Vh7+/v173LlzB+/fv8+2HjkhrFVauHAhwsPD4eXlBWNjY9y+fRsZGRn46aef4OPjA2tra5ibm6Nhw4Y4duyYWj6q6zJ07VtAfc2lYM57+vRp/PDDD7C3t4e5uTk6duyIV69eidLKZDKEhobCxcUFZmZmaNKkCW7fvq2WZ2ZmJqZPn46yZcvCxMQEJUuWRIMGDRAREZGnftO2ti27+bxnzx5UqVIFxsbGqFy5Mg4ePKiW/tSpU6hTpw5MTEzg5eUlN6XOCX9/f3h4eGDz5s1q1zIzM7Fz5040adIELi4uuH79OkJCQuDp6QkTExM4OTlh4MCBePPmTY7laFon9PTpUwQHB8Pc3BwODg74/vvvkZ6erpZWU5/JZDKEh4ejcuXKMDExgaOjI4YNGya6LwA2R9q2bYtDhw6hdu3aMDU1xW+//ZbtWrv8zMnU1FSMHj0adnZ2sLS0RPv27fHs2TO1PJOSkjBmzBh4eHjA2NgYDg4OaN68Oa5cuQIAGDVqFCwsLNTyB4CePXvCyckJUqlUHnbgwAE0bNgQ5ubmsLS0RJs2bXDr1i21frSwsMCDBw/QunVrWFpaonfv3mr5A5rXvgKa1yi+ePECAwYMQKlSpWBsbAxnZ2d06NBB/nz18PDArVu3cPz4cflzMLs1Y7kpW2jTs2fPEBwcDAsLC9jb22PcuHGi/gGAd+/eISQkBNbW1rCxsUH//v0LxEQvMjIStWvXFt172tYubty4ET4+PjA1NYWtrS169Oih9vvYuHFjVKlSBbdv30aTJk1gZmYGV1dXzJ8/XxRP1+dsbvpTU711ndOaePz4Mby9vVGlShW8fPkSABuHMWPGwM3NDcbGxvD29sa8efPUrBOU8w8NDcX48eMBAGXKlJHPo+zW02lbk69tzaJUKi2Sdxdt3Lp1C02bNoWpqSlKlSqFmTNnarTY0FT/9PR0TJs2Dd7e3jA2NoabmxsmTJig8XmqC2vWrMGmTZvwzTffoFOnTqJrRTV+wn1w/fp1BAQEwMzMDN7e3vK1scePH0fdunVhamqK8uXL48iRI6L0qmsuc/sc4nA4jDx9bnz//r3aAnyJRIKSJUsCYGYyf//9NwYNGoQbN27A0tIShw4dwqpVqzBjxgxUr15dlHbWrFmQSCSYOHEiEhISEB4ejsDAQERFRcm1Bf/++y9atWoFHx8fufC6du1aNG3aFCdPnoSvr68oz65du6Js2bKYPXs2iCjb9pw8eRJ79+7FyJEjAQBz5sxB27ZtMWHCBCxbtgwjRoxAYmIi5s+fj4EDB+Lff/+Vp81tvbp164YyZcpgzpw5uHLlClavXg0HBwfMmzcPALBhwwY1c1IvL69s6z9q1CisX79eFGZvby//f+7cuZg7dy7c3d3lD82rV6/C3NxcbS2JUN+rV6/Kf9h2796NAQMGYO3atQViGrJ27VqkpaVh6NChMDY2hq2tLT58+IDVq1ejZ8+eGDJkCJKSkvD7778jKCgIFy5cQI0aNXLMN6e+zY5vv/0WJUqUwLRp0/Do0SOEh4dj1KhR2LZtmzzO5MmTMX/+fLRr1w5BQUG4d
u0agoKCkJaWJsorNDQUc+bMkY/jhw8fcOnSJVy5cgXNmzfPdX/lllOnTmHXrl0YMWIELC0t8X//93/o3Lkznjx5Ir9Hb9y4gRYtWsDe3h6hoaHIysrCtGnT4OjomGP+EokEvXr1wuzZs3Hr1i1UrlxZfu3gwYN4+/atXACJiIjAw4cPMWDAADg5OeHWrVtYuXIlbt26hXPnzuXKEUhqaiqaNWuGJ0+eYPTo0XBxccGGDRtE92N2DBs2DOvWrcOAAQMwevRoxMbGYunSpbh69SpOnz4t0nbevXsXPXv2xLBhwzBkyBCUL19e53oqo8ucDAkJwfbt29G3b1/Uq1cPx48fR5s2bdTyGj58OHbu3IlRo0ahUqVKePPmDU6dOoXo6GjUqlUL3bt3x6+//op9+/aha9eu8nQpKSn4+++/ERISIv8qv2HDBvTv3x9BQUGYN28eUlJSsHz5cjRo0ABXr14VCeZZWVkICgpCgwYNsHDhQpiZmeWpL5Tp3Lkzbt26hW+//RYeHh5ISEhAREQEnjx
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxwAAANXCAYAAAC/mFmnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8WgzjOAAAACXBIWXMAAA9hAAAPYQGoP6dpAABjJ0lEQVR4nO3deVhU9fv/8deAbIKAIAKaWy4orqWl5J6ammWWLS4lltknU8pwy3LPorS03NJKsUWzXNo0TVwrt8xdc0kzqRQwS3FJQDi/P/o535nQYugcZ9Dn47rmuphzzszcM36mDy/u+7yPzTAMQwAAAABgAS93FwAAAADg6kXgAAAAAGAZAgcAAAAAyxA4AAAAAFiGwAEAAADAMgQOAAAAAJYhcAAAAACwDIEDAAAAgGUIHAAAAAAsQ+AAgP9v/Pjxuv766+Xt7a169eqZ/vw9e/ZUxYoVTX/eq03FihXVs2dPd5cBADAJgQOAR5k9e7ZsNpu+++47p+2nTp3SzTffLH9/fy1btsz0112+fLkGDx6sxo0bKzk5WS+++KLpr3Gt+f777zVq1Cj99NNP7i4FAOBGxdxdAAD8m8zMTN12223auXOnPv74Y7Vr187011i1apW8vLw0c+ZM+fr6mv78kvTWW28pLy/Pkuf2RN9//71Gjx6tFi1auNTZ2b9/v7y8+HsYAFwtCBwAPNrp06fVtm1bbd++XYsWLVL79u0teZ2MjAwFBARYFjYkycfHx7LnLuoMw9D58+cVEBAgPz8/d5cDADARf0IC4LHOnDmjdu3aaevWrVq4cKE6dOhg37dt2za1b99ewcHBCgoKUqtWrbRx40anx18cz1q3bp0SExMVERGhwMBA3X333Tp+/Lj9OJvNpuTkZJ09e1Y2m002m02zZ8/WTz/9ZP/572w2m0aNGmW/f/r0afXv318VK1aUn5+fSpcurTZt2mjr1q32Yy51DsfZs2c1YMAAlStXTn5+foqJidErr7wiwzDyvV6/fv30ySefqFatWvLz81PNmjXzjZcVpI4WLVqoVq1a2rlzp5o3b67ixYurSpUqWrBggSRp7dq1atiwoQICAhQTE6MVK1Y4vcaRI0f0xBNPKCYmRgEBAQoPD9d9993nNDo1e/Zs3XfffZKkli1b2j/XNWvWSPrrPI077rhDX375pRo0aKCAgADNmDHDvu/iORyGYahly5aKiIhQRkaG/fmzs7NVu3ZtVa5cWWfPns337wMA8BwEDgAe6ezZs2rfvr02b96s+fPn64477rDv27Nnj5o2baodO3Zo8ODBGj58uA4fPqwWLVpo06ZN+Z4rISFBO3bs0MiRI9WnTx99/vnn6tevn33/e++9p6ZNm8rPz0/vvfee3nvvPTVr1syleh9//HG98cYb6ty5s6ZNm6aBAwcqICBAe/fuvexjDMNQx44dNXHiRLVr104TJkxQTEyMBg0apMTExHzHf/PNN3riiSfUpUsXjRs3TufPn1fnzp114sQJl+v4448/dMcdd6hhw4YaN26c/Pz81KVLF3344Yfq0qWLbr/9dr300ks6e/as7r33Xp0+fdr+2M2bN2v9+vXq0qWLJk2apMcff1wrV65UixYtdO7cOUlSs2bN9OSTT0qSnn32WfvnWqNGDfvz7N+/X127dlWbNm30+uuvX/JEfZvNplmzZun8+fN6/PHH7dtHjhypPXv2KDk5WYGBgf/yrwMAcCsDADxIcnKyIcmoUKGC4ePjY3zyySf5junUqZPh6+trHDp0yL7t6NGjRokSJYxmzZrle67WrVsbeXl59u1PP/204e3tbZw8edK+LT4+3ggMDHR6ncOHDxuSjOTk5Hw1SDJGjhxpvx8SEmL07dv3H99bfHy8UaFCBfv9Tz75xJBkjB071um4e++917DZbMbBgwedXs/X19dp244dOwxJxuTJk12qo3nz5oYkY+7cufZt+/btMyQZXl5exsaNG+3bv/zyy3yfwblz5/I954YNGwxJxrvvvmvfNn/+fEOSsXr16nzHV6hQwZBkLFu27JL74uPjnbbNmDHDkGS8//77xsaNGw1vb2+jf//+//g+AQCegQ4HAI+Unp4uf39/lStXzml7bm6uli9frk6dOun666+3b4+Ojla3bt30zTffKDMz0+kxjz32mGw2m/1+06ZNlZubqyNHjphWb2hoqDZt2qSjR48W+DFffPGFvL297Z2AiwYMGCDDMLR06VKn7a1bt1blypXt9+vUqaPg4GD9+OOPLtcRFBSkLl262O/HxMQoNDRUNWrUUMOGDe3bL/7s+BoBAQH2n3NycnTixAlVqVJFoaGhTqNb/6ZSpUpq27ZtgY597LHH1LZtWyUkJOihhx5S5cqVWUkMAIoIAgcAjzRjxgz5+vqqXbt22r9/v3378ePHde7cOcXExOR7TI0aNZSXl6eff/7ZaXv58uWd7pcsWVLSX2NFZhk3bpx2796tcuXK6eabb9aoUaOcfkm/lCNHjqhMmTIqUaKE0/aLY0d/D0R/fx/SX+/F8X0UtI7rrrvOKYRJUkhISL6AFxISIsn5s/rzzz81YsQI+3knpUqVUkREhE6ePKlTp07943t2VKlSpQIfK0kzZ87UuXPn9MMPP2j27NlOwQcA4LkIHAA8UmxsrL744gv9+eefatOmTb4Q4Qpvb+9Lbjf+dmL23/39F/KLcnNz8227//779eOPP2ry5MkqU6aMxo8fr5o1a+brUvwXBXkfBa3jcs9VkNdISEjQCy+8oPvvv18fffSRli9frpSUFIWHh7u07K+rgWHNmjXKysqSJO3atculxwIA3IfAAcBj3Xzzzfrkk0+UkZGhNm3a6Pjx44qIiFDx4sWduh4X7du3T15eXvn+Sl9YFzshJ0+edNp+uVGs6OhoPfHEE/rkk090+PBhhYeH64UXXrjs81eoUEFHjx51OiFb+ut9XNxfGK7W4aoFCxYoPj5er776qu699161adNGTZo0yfc5XS6wFcaxY8eUkJCg2267TXfccYcGDhxo6kgcAMA6BA4AHq1Vq1b64IMPdPDgQbVr105nz57Vbbfdpk8//dRpGdb09HTNnTtXTZo0UXBwsCmvHRwcrFKlSumrr75y2j5t2jSn+7m5uflGiUqXLq0yZcrY/yJ/Kbfffrtyc3M1ZcoUp+0TJ06UzWZz+Zojha3DVd7e3vm6Q5MnT87X+bm4etTfg0hh9O7dW3l5eZo5c6befPNNFStWTL169frXLhUAwP248B8Aj3f33Xfrrbfe0iOPPKKOHTtq6tSpSklJUZMmTfTEE0+oWLFimjFjhrKysjRu3DhTX/vRRx/VSy+9pEcffVQNGjTQV199pQMHDjgdc/r0aV133XW69957VbduXQUFBWnFihXavHmzXn311cs+95133qmWLVvqueee008//aS6detq+fLl+vTTT9W/f3+nE8QLorB1uOqOO+7Qe++9p5CQEMXGxmrDhg1asWKFwsPDnY6rV6+evL299fLLL+vUqVPy8/PTrbfeqtKlS7v0esnJyVqyZIlmz56t6667T
tJfAefBBx/UG2+8oSeeeMK09wYAMB+BA0CR8PDDD+v333/XwIED9cwzz2j16tUaPny4kpKSlJeXp4YNG+r99993WmHJDCNGjNDx48e1YMECffTRR2rfvr2WLl3q9Etz8eLF9cQTT2j58uVatGiR8vLyVKVKFU2bNk19+vS57HN7eXnps88+04gRI/Thhx8qOTlZFStW1Pjx4zVgwACXay1sHa56/fXX5e3trTlz5uj8+fNq3LixVqxYkW/FqaioKE2fPl1JSUnq1auXcnNztXr1apcCxy+//KKnn35ad955p+Lj4+3bu3fvroULF2rw4MFq3769yyegAwCuHJtBPxoAAACARTiHAwAAAIBlCBwAAAAALEPgAAAAAGAZAgcAAAAAyxA4AAAAAFiGwAEAAADAMgQOAAAAAJa5Ki/8dyD9nLtLAIqk8uHF3V0CAOAa4e/Bv4UG3NDP3SVc1p/bpri7BJfR4QAAAABgGQIHAAAAAMt4cDMLAAAAcAMbf5M3E58mAAAAAMsQOAAAAABYhpE
"text/plain": [
"<Figure size 1000x1000 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"\n",
"def plot_confusion_matrix(conf_matrix, classes, title='Konfusionsmatrix', cmap=plt.cm.Blues):\n",
" \"\"\"\n",
" Zeichnet die Konfusionsmatrix.\n",
"\n",
" Parameters:\n",
" conf_matrix (np.array): Die Konfusionsmatrix.\n",
" classes (List): Eine Liste der Klassennamen.\n",
" title (str): Der Titel der Grafik.\n",
" cmap (matplotlib.colors.Colormap): Die Farbkarte für die Matrix.\n",
" \"\"\"\n",
" plt.figure(figsize=(10, 10))\n",
" sns.heatmap(conf_matrix, annot=True, fmt='d', cmap=cmap, xticklabels=classes, yticklabels=classes)\n",
" plt.title(title)\n",
" plt.ylabel('Wahre Labels')\n",
" plt.xlabel('Vorhergesagte Labels')\n",
" plt.show()\n",
"\n",
"plot_confusion_matrix(conf_matrix, [\"kich\", \"kirc\", \"kirp\"])"
]
},
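{
"cell_type": "markdown",
"metadata": {},
"source": [
"A short follow-up sketch: per-class recall derived from the confusion matrix above, assuming the same class order `[\"kich\", \"kirc\", \"kirp\"]` produced by the `LabelEncoder`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: rows of conf_matrix are the true labels, columns the predictions.\n",
"class_names = [\"kich\", \"kirc\", \"kirp\"]\n",
"per_class_recall = conf_matrix.diagonal() / conf_matrix.sum(axis=1)\n",
"\n",
"for name, recall in zip(class_names, per_class_recall):\n",
"    print(f\"Recall {name}: {recall:.3f}\")\n",
"\n",
"print(f\"Overall accuracy: {conf_matrix.diagonal().sum() / conf_matrix.sum():.3f}\")"
]
},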
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lesen der Daten und Erstellen der Mittelwerte und anschließender Auswertung"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pickle\n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"from typing import List, Dict, Tuple\n",
"\n",
"def load_results(path: str) -> Dict:\n",
" \"\"\"\n",
" Lädt und konvertiert die Ergebnisse aus Pickle-Dateien in den spezifizierten Verzeichnissen.\n",
"\n",
" Argumente:\n",
" path (str): Der Pfad zum Basisverzeichnis, das die Ergebnis-Unterverzeichnisse enthält.\n",
"\n",
" Rückgabe:\n",
" Dict: Ein Dictionary, in dem jeder Schlüssel einer PCA-Komponentenanzahl entspricht und\n",
" dessen Werte Achtertupel aus Durchschnitt und Standardabweichung von Trainingsverlust,\n",
" Validierungsverlust, Trainingsgenauigkeit und Validierungsgenauigkeit sind.\n",
" \"\"\"\n",
"\n",
" results = {}\n",
"\n",
" # Über alle Ordner im Basispfad iterieren\n",
" for directory in os.listdir(path):\n",
" full_path = os.path.join(path, directory)\n",
"\n",
" # Überprüfen, ob es sich um einen Ordner handelt\n",
" if os.path.isdir(full_path):\n",
" pickle_file = f'results_{directory}.pickle'\n",
" pickle_path = os.path.join(full_path, pickle_file)\n",
"\n",
" # Überprüfen, ob die Pickle-Datei existiert\n",
" if os.path.isfile(pickle_path):\n",
" # Pickle-Datei laden\n",
" with open(pickle_path, 'rb') as file:\n",
" results[directory] = pickle.load(file)\n",
"\n",
" converted_results = {}\n",
" for values in list(results.values()):\n",
" key = list(values.keys())[0]\n",
" value = list(values.values())[0]\n",
" converted_results[key] = value\n",
"\n",
" return converted_results\n",
"\n",
"\n",
"def calculate_means_for_n_last(results: Dict, n_last: int) -> Dict:\n",
" \"\"\"\n",
" Berechnet Durchschnittswerte und Standardabweichungen für die letzten `n_last` Ergebnisse.\n",
"\n",
" Argumente:\n",
" results (Dict): Ein Dictionary von Ergebnissen, wie von `load_results` zurückgegeben.\n",
" n_last (int): Anzahl der letzten Ergebnisse, die zur Berechnung verwendet werden sollen.\n",
"\n",
" Rückgabe:\n",
" Dict: Ein Dictionary mit Schlüsseln als PCA-Komponentenanzahlen und Werten als Achtertupel,\n",
" bestehend aus Durchschnitt und Standardabweichung von Trainingsverlust, \n",
" Validierungsverlust, Trainingsgenauigkeit und Validierungsgenauigkeit.\n",
" \"\"\"\n",
"\n",
" assert results is not None\n",
" assert n_last <= len(list(results.values())[0][0][0])\n",
"\n",
" means_and_stds = {}\n",
"\n",
" for key, value in results.items():\n",
" train_losses, valid_losses, train_accuracies, valid_accuracies = zip(*value)\n",
" \n",
" train_losses = train_losses[:n_last] \n",
" valid_losses = valid_losses[:n_last]\n",
" train_accuracies = train_accuracies[:n_last]\n",
" valid_accuracies = valid_accuracies[:n_last]\n",
"\n",
" avg_train_loss = np.mean(train_losses)#, axis=0)\n",
" avg_valid_loss = np.mean(valid_losses)#, axis=0)\n",
" avg_train_acc = np.mean(train_accuracies)#, axis=0)\n",
" avg_valid_acc = np.mean(valid_accuracies)#, axis=0)\n",
"\n",
" std_train_loss = np.std(train_losses)#, axis=0)\n",
" std_valid_loss = np.std(valid_losses)#, axis=0)\n",
" std_train_acc = np.std(train_accuracies)#, axis=0)\n",
" std_valid_acc = np.std(valid_accuracies)#, axis=0)\n",
"\n",
" print(f\"### {key} PCA Komponenten ###\")\n",
" print(f\"Trainingsverlust: {avg_train_loss:.3f} \\u00B1 {std_train_loss:.3f}\")\n",
" print(f\"Validierungsverlust: {avg_valid_loss:.3f} \\u00B1 {std_valid_loss:.3f}\")\n",
" print(f\"Trainingsgenauigkeit: {avg_train_acc:.3f} \\u00B1 {std_train_acc:.3f}\")\n",
" print(f\"Validierungsgenauigkeit: {avg_valid_acc:.3f} \\u00B1 {std_valid_acc:.3f}\\n\")\n",
"\n",
" means_and_stds[key] = (avg_train_loss, std_train_loss, avg_valid_loss, std_valid_loss, avg_train_acc, std_train_acc, avg_valid_acc, std_valid_acc)\n",
"\n",
" # Initialisierung der Variablen für die Minima und Maxima\n",
" min_train_loss = float('inf')\n",
" min_valid_loss = float('inf')\n",
" max_train_acc = 0\n",
" max_valid_acc = 0\n",
"\n",
" # Durchlaufen aller berechneten Mittelwerte und Standardabweichungen\n",
" for key, (avg_train_loss, std_train_loss, avg_valid_loss, std_valid_loss, avg_train_acc, std_train_acc, avg_valid_acc, std_valid_acc) in means_and_stds.items():\n",
" if avg_train_loss < min_train_loss:\n",
" min_train_loss = avg_train_loss\n",
" min_train_loss_key = key\n",
"\n",
" if avg_valid_loss < min_valid_loss:\n",
" min_valid_loss = avg_valid_loss\n",
" min_valid_loss_key = key\n",
"\n",
" if avg_train_acc > max_train_acc:\n",
" max_train_acc = avg_train_acc\n",
" max_train_acc_key = key\n",
"\n",
" if avg_valid_acc > max_valid_acc:\n",
" max_valid_acc = avg_valid_acc\n",
" max_valid_acc_key = key\n",
"\n",
" # Drucken der Endresultate\n",
" print(f\"### Auswertung ###\")\n",
" print(f\"Niedrigster Trainingsverlust: {min_train_loss:.3f} bei {min_train_loss_key} PCA-Komponenten\")\n",
" print(f\"Niedrigster Validierungsverlust: {min_valid_loss:.3f} bei {min_valid_loss_key} PCA-Komponenten\")\n",
" print(f\"Höchste Trainingsgenauigkeit: {max_train_acc:.3f} bei {max_train_acc_key} PCA-Komponenten\")\n",
" print(f\"Höchste Validierungsgenauigkeit: {max_valid_acc:.3f} bei {max_valid_acc_key} PCA-Komponenten\")\n",
"\n",
" return means_and_stds\n",
"\n",
"\n",
"def plot_results(results: Dict, show_lines: bool = True) -> None:\n",
" \"\"\"\n",
" Stellt die Ergebnisse als Fehlerbalkendiagramme dar. Jedes Diagramm zeigt Mittelwert und\n",
" Standardabweichung von Trainings- und Validierungsverlust sowie -genauigkeit. \n",
" Fügt zusätzlich eine rote Linie für den höchsten Genauigkeitswert und den geringsten Verlustwert hinzu,\n",
" mit einer Beschriftung, die den Schlüssel des entsprechenden höchsten bzw. niedrigsten Werts anzeigt.\n",
"\n",
" Argumente:\n",
" results (Dict): Ein Dictionary von berechneten Mittelwerten und Standardabweichungen,\n",
" wie von `calculate_means_for_n_last` zurückgegeben.\n",
" show_lines (bool): Ein flag, das angibt, ob die Maximal- / Minimallinie gezeichnet werden soll.\n",
" \"\"\"\n",
" # Schlüssel sortieren\n",
" sorted_keys = sorted(results.keys(), key=lambda x: int(x))\n",
"\n",
" # Listen für das Plotten vorbereiten\n",
" mean_train_loss = [results[k][0] for k in sorted_keys]\n",
" std_train_loss = [results[k][1] for k in sorted_keys]\n",
" mean_validation_loss = [results[k][2] for k in sorted_keys]\n",
" std_validation_loss = [results[k][3] for k in sorted_keys]\n",
" mean_train_accuracy = [results[k][4] for k in sorted_keys]\n",
" std_train_accuracy = [results[k][5] for k in sorted_keys]\n",
" mean_validation_accuracy = [results[k][6] for k in sorted_keys]\n",
" std_validation_accuracy = [results[k][7] for k in sorted_keys]\n",
"\n",
" # Plotten\n",
" plt.figure(figsize=(12, 8))\n",
"\n",
" # Verluste\n",
" plt.errorbar(sorted_keys, mean_train_loss, yerr=std_train_loss, label='Trainingverlust', fmt='o', linestyle='--', alpha=0.5)\n",
" plt.errorbar(sorted_keys, mean_validation_loss, yerr=std_validation_loss, label='Validierungsverlust', fmt='o', linestyle='--', alpha=0.5)\n",
"\n",
" # Genauigkeiten\n",
" plt.errorbar(sorted_keys, mean_train_accuracy, yerr=std_train_accuracy, label='Trainingsgenauigkeit', fmt='x', linestyle='--', alpha=0.5)\n",
" plt.errorbar(sorted_keys, mean_validation_accuracy, yerr=std_validation_accuracy, label='Validierungsgenauigkeit', fmt='x', linestyle='--', alpha=0.5)\n",
"\n",
" # Gestaltung\n",
" plt.xlabel('Anzahl der PCA Komponenten')\n",
" plt.ylabel('Werte')\n",
" plt.title('Trainings- und Validierungsverlust und -genauigkeit')\n",
" plt.grid(True)\n",
"\n",
" # Höchste Genauigkeit und geringster Verlust\n",
" highest_accuracy = max(max(mean_train_accuracy), max(mean_validation_accuracy))\n",
" lowest_loss = min(min(mean_train_loss), min(mean_validation_loss))\n",
"\n",
" # Schlüssel für höchste Genauigkeit und geringsten Verlust finden\n",
" highest_acc_key = sorted_keys[mean_train_accuracy.index(max(mean_train_accuracy))] if max(mean_train_accuracy) > max(mean_validation_accuracy) else sorted_keys[mean_validation_accuracy.index(max(mean_validation_accuracy))]\n",
" lowest_loss_key = sorted_keys[mean_train_loss.index(min(mean_train_loss))] if min(mean_train_loss) < min(mean_validation_loss) else sorted_keys[mean_validation_loss.index(min(mean_validation_loss))]\n",
"\n",
" plt.legend()\n",
"\n",
" # Linien und Text für höchste Genauigkeit und geringsten Verlust\n",
" if show_lines:\n",
" plt.axhline(y=highest_accuracy, color='r', linestyle='-', alpha=0.8)\n",
" plt.text(0.95, highest_accuracy, f'Höchste Genauigkeit (PCA: {highest_acc_key})', verticalalignment='bottom', horizontalalignment='right', color='red', fontsize=10, transform=plt.gca().get_yaxis_transform())\n",
"\n",
" plt.axhline(y=lowest_loss, color='r', linestyle='-', alpha=0.8)\n",
" plt.text(0.95, lowest_loss, f'Geringster Verlust (PCA: {lowest_loss_key})', verticalalignment='top', horizontalalignment='right', color='red', fontsize=10, transform=plt.gca().get_yaxis_transform())\n",
" \n",
" plt.savefig('Experiments/Endergebnisse_mit_Linien.png')\n",
" else:\n",
" plt.savefig('Experiments/Endergebnisse_ohne_Linien.png')\n",
"\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Ergebnisse laden\n",
"base_path = 'Experiments'\n",
"loaded_results = load_results(base_path)\n",
"\n",
"# Ergebnisse verarbeiten und plotten\n",
"m_a_s = calculate_means_for_n_last(loaded_results, 50)\n",
"plot_results(m_a_s, show_lines=False)\n",
"plot_results(m_a_s)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO MNIST datenstaz mit wget ohne tensorflow oder pytorch einlesen"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "rl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}