Predicting SDG indicators using Deep Learning¶
Previous work¶
I will be working with the DHS dataset introduced as part of SustainBench, a suite of sustainability benchmarks targeting the SDGs. The dataset consists of geographical data points, each containing data on poverty, child mortality, women's educational attainment, women's BMI, water quality and access to sanitation, as well as one satellite image and multiple street-level images of each location. The socio-economic data is sourced from the DHS Program, a US-government-funded project which conducts household-level surveys in various countries. The daylight satellite images are sourced from the NASA satellites Landsat 5, 7 and 8; the nighttime satellite images are sourced from the US-operated DMSP and VIIRS-equipped satellites. The street-level images are sourced from Mapillary, an online service for sharing crowdsourced, geotagged photos. The documentation of the dataset is available on the SustainBench website.
The authors provide a simple k-NN baseline model for predicting each of the mentioned features given the mean night-light value of the corresponding satellite image. The performance of the baseline models varies from terrible (squared Pearson correlation $r^2 = 0.01$ for child mortality) to okay ($r^2 = 0.63$ for the asset index). The only other model trained on this dataset, according to the leaderboard on the aforementioned SustainBench website, uses street-level images to predict women's average BMI in India. It outperforms the baseline model with a squared correlation coefficient of $r^2 = 0.57$, compared to $r^2 = 0.42$ for the baseline model (Lee et al., 2021).
The general task of predicting socio-economic data from images using machine learning has been attempted outside of the SustainBench framework. Jean et al. (2016) use transfer learning with a pre-trained CNN to estimate economic well-being given satellite imagery. Hall et al. (2023) review the many recent attempts at predicting poverty from satellite images. Abitbol et al. (2020) achieved promising results when predicting socio-economic indicators from satellite images, yet note the lack of interpretability of the model. Burke et al. (2021) conclude that satellite-based approaches to quantifying indicators should not replace data collected on the ground, but should rather enhance it as part of a combined approach.
There is thus a considerable body of research in this field. Nonetheless, many questions remain regarding the ethics and reliability of using machine learning to infer such indicators.
SustainBench has not been widely adopted as a uniform benchmark task. Five out of six features within the DHS dataset have no projects competing with the baseline model in the leaderboard.
Exploratory data analysis¶
Overview of the data¶
We can get a brief overview of the data with the pandas function describe().
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import contextily as ctx
from shapely.geometry import Point
data = pd.read_csv('dhs_final_labels.csv')
data.describe()
year | lat | lon | n_asset | asset_index | n_water | water_index | n_sanitation | sanitation_index | under5_mort | n_under5_mort | women_edu | women_bmi | n_women_edu | n_women_bmi | cluster_id | adm1dhs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 117644.000000 | 117644.000000 | 117644.000000 | 86936.000000 | 86936.000000 | 87938.000000 | 87938.000000 | 89271.000000 | 89271.000000 | 105582.000000 | 105582.000000 | 117062.000000 | 94866.000000 | 117062.000000 | 94866.000000 | 1.176440e+05 | 117644.000000 |
mean | 2010.964894 | 10.875259 | 29.263579 | 23.914558 | 0.174589 | 23.937615 | 3.763723 | 24.009242 | 3.086101 | 18.163958 | 18.345021 | 6.354988 | 23.296365 | 24.861065 | 18.778098 | 6.487475e+05 | 1053.883071 |
std | 5.301742 | 16.276815 | 54.533060 | 7.779958 | 1.848209 | 7.806283 | 1.123908 | 7.765566 | 1.282027 | 46.747577 | 12.160344 | 3.468181 | 2.946691 | 11.069406 | 9.611864 | 4.907284e+06 | 3037.622967 |
min | 1996.000000 | -30.588811 | -92.176053 | 5.000000 | -3.823164 | 5.000000 | 1.000000 | 5.000000 | 1.000000 | 0.000000 | 5.000000 | 0.000000 | 15.758333 | 5.000000 | 5.000000 | 1.000000e+00 | 0.000000 |
25% | 2007.000000 | 0.057170 | -0.137666 | 20.000000 | -1.451730 | 20.000000 | 3.037037 | 20.000000 | 2.037037 | 0.000000 | 10.000000 | 3.750000 | 21.148019 | 18.000000 | 11.000000 | 1.900000e+02 | 5.000000 |
50% | 2013.000000 | 11.982408 | 32.822764 | 22.000000 | 0.179011 | 22.000000 | 3.933333 | 22.000000 | 2.958333 | 0.000000 | 15.000000 | 6.333333 | 22.804907 | 24.000000 | 18.000000 | 4.660000e+02 | 13.000000 |
75% | 2015.000000 | 24.617686 | 77.497416 | 27.000000 | 1.842407 | 27.000000 | 4.826087 | 27.000000 | 4.272727 | 0.000000 | 23.000000 | 8.916667 | 24.898071 | 30.000000 | 24.000000 | 5.098325e+04 | 31.000000 |
max | 2019.000000 | 48.436031 | 126.842321 | 108.000000 | 3.607050 | 108.000000 | 5.000000 | 108.000000 | 5.000000 | 692.307692 | 166.000000 | 17.800000 | 48.111667 | 130.000000 | 118.000000 | 7.571352e+07 | 9999.000000 |
We can see that there are more than 100'000 data points, although not all data points contain values for all the columns. There are 63'329 rows without NaN values, which will become relevant later. According to the SustainBench paper (Yeh et al., 2021), the columns describe the following data:
- year: Year of data collection.
- lat: Latitude coordinate of the data point.
- lon: Longitude coordinate of the data point.
- n_asset: Number of asset-index-related observations.
- asset_index: Index or score representing the asset wealth of the average household. High is rich.
- n_water: Number of water-quality-index-related observations.
- water_index: Water quality index. High is good.
- n_sanitation: Number of sanitation-index-related observations.
- sanitation_index: Index describing the location's average access to sanitation facilities. High is good.
- under5_mort: Mortality rate of children under 5 years of age per 1000.
- n_under5_mort: Number of observations related to under-5 mortality.
- women_edu: Average years of educational attainment of women in the surveyed area.
- women_bmi: Average Body Mass Index (BMI) measurement of women.
- n_women_edu: Number of women surveyed regarding their education.
- n_women_bmi: Number of women surveyed regarding their BMI.
- cluster_id: Identifier for the cluster or group that the data point belongs to.
- adm1dhs: Administrative division identifier, specific to the Demographic and Health Surveys (DHS) system.
The columns we are interested in are asset_index, water_index, sanitation_index, under5_mort, women_edu and women_bmi, as they align with indicators of various SDGs.
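As a quick check of the row count mentioned above, the number of rows without any NaN value can be counted directly from the dataframe loaded earlier (a minimal snippet):
# Count the rows that contain no NaN value in any column
print(len(data.dropna()))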
Spatial representation of the dataset¶
The first plot shows the spatial distribution of the dataset, i.e., which parts of the world the dataset contains data on. We can observe that quite a large part of the world is represented, but coverage is still limited, with barely any data on countries in the Global North. Countries close to the equator are clearly overrepresented. This limits the comparisons we can make with regard to the features.
# Convert pandas dataframe to a geodataframe which is useful for plotting the location on a map
geo_data = gpd.GeoDataFrame(data, geometry=gpd.points_from_xy(data.lon, data.lat))
# The raw coordinates are WGS84 latitude/longitude (EPSG:4326)
geo_data.set_crs(epsg=4326, inplace=True)
# Convert to the Web Mercator projection (EPSG:3857), the standard projection for online map tiles
gdf = geo_data.to_crs(epsg=3857)
fig, ax = plt.subplots(figsize=(10, 10))
# Plot a point denoting the location on a map for each entry of the dataset
gdf.plot(ax=ax, color='black', alpha=0.1)
# Add world map backdrop
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Spatial representation of dataset')
# Such that we can see the whole world
ax.set_xlim(-2e7, 2e7)
ax.set_ylim(-1e7, 1e7)
plt.show()
Spatial representation of the features¶
Next, we are going to look at how the different features are spatially distributed. For this, I have generated one plot per feature of interest. We can observe some interesting phenomena in these plots:
- We can already infer correlation, as regions tend to sit at similar ends of the spectrum for multiple features.
- We can infer variance within countries, such as Colombia's sanitation index being significantly lower on the coast of the Pacific Ocean than in the Andes region of the country.
- Child mortality is unusually high in Nepal. This is possibly a measurement error.
- The European countries seem to be wealthy and the central African countries poor (with respect to the asset index). There is a very similar tendency with regard to the sanitation and water quality indices.
from matplotlib import colormaps
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import random
def plot_geo_data(data, key, title, cmap='random'):
"""
Creates a plot over a world map.
Args:
data (pandas dataframe): The dataframe containing the data to be plotted.
key (str): The key of the feature (i.e., the column in the dataframe) which is to be plotted.
title (str): For the title of the plot.
cmap (str, optional): Name of the matplotlib colormap to use; 'random' picks a random colormap.
"""
# Create a matplotlib figure and axis with a geo projection
fig, ax = plt.subplots(figsize=(15, 10), subplot_kw={'projection': ccrs.PlateCarree()})
# Add features to the map: coastlines, borders, and land
ax.add_feature(cfeature.COASTLINE)
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.add_feature(cfeature.LAND, edgecolor='black')
# Make sure the whole map is visible
ax.set_extent([-180, 180, -90, 90], crs=ccrs.PlateCarree())
cmap_input = cmap
cmap = random.choice(colormaps()) if cmap_input == 'random' else cmap_input
if cmap_input == 'random':
print(cmap)
scatter = ax.scatter(data['lon'], data['lat'], c=data[key], marker='o', cmap=cmap, transform=ccrs.PlateCarree(), alpha=0.4)
plt.colorbar(scatter, ax=ax, label=title, shrink=0.6)
plt.title(f'Spatial Distribution of {title}')
plt.show()
plot_geo_data(data, 'women_bmi', 'Women\'s Average BMI', cmap='inferno_r')
plot_geo_data(data, 'under5_mort', 'Child Mortality Rate', cmap='inferno_r')
plot_geo_data(data, 'asset_index', 'Asset Index', cmap='RdYlGn')
plot_geo_data(data, 'sanitation_index', 'Sanitation Index', cmap='RdYlGn')
plot_geo_data(data, 'water_index', 'Water Quality Index', cmap='RdYlGn')
plot_geo_data(data, 'women_edu', 'Women\'s Educational Attainment', cmap='RdYlGn')
# Dictionary mapping country codes to countries as mentioned on page 23 of the SustainBench paper
country_mapping = {
'AL': 'Albania',
'AM': 'Armenia',
'AO': 'Angola',
'BD': 'Bangladesh',
'BF': 'Burkina Faso',
'BJ': 'Benin',
'BO': 'Bolivia',
'BU': 'Burundi',
'CD': 'Congo Democratic Republic',
'CI': 'Cote d’Ivoire',
'CM': 'Cameroon',
'CO': 'Colombia',
'DR': 'Dominican Republic',
'EG': 'Egypt',
'ET': 'Ethiopia',
'GA': 'Gabon',
'GH': 'Ghana',
'GN': 'Guinea',
'GU': 'Guatemala',
'GY': 'Guyana',
'HN': 'Honduras',
'HT': 'Haiti',
'IA': 'India',
'ID': 'Indonesia',
'JO': 'Jordan',
'KE': 'Kenya',
'KH': 'Cambodia',
'KM': 'Comoros',
'KY': 'Kyrgyz Republic',
'LB': 'Liberia',
'LS': 'Lesotho',
'MA': 'Morocco',
'MB': 'Moldova',
'MD': 'Madagascar',
'ML': 'Mali',
'MM': 'Myanmar',
'MW': 'Malawi',
'MZ': 'Mozambique',
'NG': 'Nigeria',
'NI': 'Niger',
'NM': 'Namibia',
'NP': 'Nepal',
'PE': 'Peru',
'PH': 'Philippines',
'PK': 'Pakistan',
'RW': 'Rwanda',
'SL': 'Sierra Leone',
'SN': 'Senegal',
'SZ': 'Eswatini',
'TD': 'Chad',
'TG': 'Togo',
'TJ': 'Tajikistan',
'TZ': 'Tanzania',
'UG': 'Uganda',
'ZM': 'Zambia',
'ZW': 'Zimbabwe'
}
Mean value of features over countries¶
The next set of plots shows the average of each feature across the different countries. Interestingly, the variance of the features differs significantly: while women's BMI is fairly similar across countries, the asset index and women's educational attainment differ substantially between countries. Again, we can observe Nepal's unusually high child mortality rate. For future work, it would be interesting to use boxplots to visualise the variance of the features within each country; a sketch of such a plot follows after the bar charts.
column_names = ['water_index', 'sanitation_index', 'under5_mort', 'asset_index', 'women_edu', 'women_bmi']
features_format = ['Water Quality Index', 'Sanitation Index', 'Child Mortality per 1000', 'Asset Index', 'Women\'s educational attainment in years', 'Women\'s BMI']
# Create a 3x2 grid of subplots
fig, axs = plt.subplots(3, 2, figsize=(10, 25))
plt.subplots_adjust(wspace=0.6, hspace=0.2)
# Flatten the 3x2 grid for easy iteration
axs = axs.ravel()
for i, feature in enumerate(column_names):
average_by_nation = data.groupby('cname')[feature].mean()
average_by_nation = average_by_nation.dropna()
sorted_average_by_nation = average_by_nation.sort_values(ascending=False)
countries = []
for nation in sorted_average_by_nation.keys():
countries.append(country_mapping[nation])
ax = axs[i]
ax.set_title(f'Average {features_format[i]} by country')
ax.barh(sorted_average_by_nation.keys(), sorted_average_by_nation)
ax.set_xlabel(features_format[i])
ax.set_yticks(range(len(countries)), countries, fontsize=7)
# Print the mean and standard deviation over the means for all countries
print(f'{feature}: {np.round(np.mean(sorted_average_by_nation), 2)}±{np.round(np.std(sorted_average_by_nation), 2)}')
water_index: 3.71±0.62 sanitation_index: 3.01±0.79 under5_mort: 20.03±35.72 asset_index: 0.09±1.23 women_edu: 6.11±2.77 women_bmi: 23.54±1.94
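As mentioned above, boxplots would make the within-country variance visible. The following is only a sketch of what such a plot could look like for a single feature, grouping by the same cname column used for the bar charts:
# Sketch: distribution of the asset index within each country (not part of the main analysis)
fig, ax = plt.subplots(figsize=(6, 12))
data.boxplot(column='asset_index', by='cname', vert=False, ax=ax)
ax.set_xlabel('Asset Index')
ax.set_ylabel('Country code')
ax.set_title('Asset index distribution by country')
plt.suptitle('')  # remove the automatic "grouped by" suptitle added by pandas
plt.show()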
Correlation between features¶
We use the Pearson correlation coefficient $r \in [-1, 1]$ to represent the linear correlation between the features. Each pair of features has a coefficient $r$. $r = 1$ implies a perfect positive linear relationship, such as a pair of identical features; $r = -1$ denotes a perfect negative linear relationship; $r \approx 0$ implies little to no linear correlation. The coefficient is computed as
$r = \frac{\sum (X_i - \overline{X})(Y_i - \overline{Y})}{\sqrt{\sum (X_i - \overline{X})^2 \sum (Y_i - \overline{Y})^2}} = \frac{Cov(X, Y)}{\sigma_X \sigma_Y}$
where $\overline{X}$, $\overline{Y}$ are the mean values and $X_i$, $Y_i$ the individual samples of features $X$ and $Y$, and $\sigma_X$, $\sigma_Y$ denote the standard deviations of $X$ and $Y$. This formula shows that the Pearson correlation coefficient is essentially the covariance normalised by the standard deviations, which squashes it into $[-1, 1]$.
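As a sanity check, the definition above can be translated directly into a few lines of numpy and compared against numpy's built-in np.corrcoef (a minimal sketch using the dataframe loaded above):
import numpy as np

def pearson_r(x, y):
    """Pearson correlation coefficient, computed directly from the definition above."""
    x_centred = x - np.mean(x)
    y_centred = y - np.mean(y)
    return np.sum(x_centred * y_centred) / np.sqrt(np.sum(x_centred ** 2) * np.sum(y_centred ** 2))

# Cross-check on one pair of features; both printed values should agree
pair = data[['asset_index', 'sanitation_index']].dropna()
print(pearson_r(pair['asset_index'].to_numpy(), pair['sanitation_index'].to_numpy()))
print(np.corrcoef(pair['asset_index'], pair['sanitation_index'])[0, 1])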
The correlation matrix shows the correlation between each pair of features. Note that correlation is symmetric in its arguments, thus the correlation matrix is symmetric. We can observe that the asset index and access to sanitation are highly correlated, with a Pearson correlation coefficient of 0.89: the asset index tends to be lower where access to sanitation is lower, and vice-versa. Many other feature pairs are correlated to a noteworthy degree. Only the child mortality rate is consistently uncorrelated with all other features, suggesting that the other features carry little information on child mortality.
# Nice format of the feature labels
features_format = ['Water Quality', 'Access to\nSanitation', 'Child\nMortality', 'Assets', 'Women\'s\neducation', 'Women\'s BMI']
# Compute the correlation matrix using the pearson correlation coefficient
correlation_matrix = data[column_names].corr(method='pearson')
import seaborn as sns
# Display the correlation matrix using Seaborn
plt.figure(figsize=(10, 8))
ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
ax.set_xticklabels(features_format, rotation=0)
ax.set_yticklabels(features_format, rotation=0)
plt.title('Correlation Matrix')
plt.show()
Visualising the covariance between features¶
Since the correlation coefficient is proportional to the covariance (i.e., $r \propto Cov(X, Y)$), we can visualise the correlations by plotting each pair of features against each other. We can see how certain pairs of features have a distinct linear shape, such as the highly correlated sanitation and asset indices. All the plots involving child mortality look noisy and random, reflecting its very low correlation with the other features. Note the perfectly correlated identical features along the diagonal.
# Plot covariance, as it displays many correlations intuitively
# Get separate dataframe without NaN values
data_no_nan = data.dropna(inplace=False)
fig, axes = plt.subplots(6, 6, figsize=(10, 10))
plt.subplots_adjust(wspace=0.4, hspace=0.4)
# Insert context regarding the numbers
features_format = ['Water\nQuality\nIndex', 'Access to\nSanitation', 'Child\nMortality\nper 1000', 'Asset Index', 'Women\'s\neducational\nAttainment\nin years', 'Women\'s\naverage\nBMI']
for i1, feat1 in enumerate(column_names):
for i2, feat2 in enumerate(column_names):
ax = axes[i1, i2]
if i2 == 0:
ax.set_ylabel(features_format[i1], labelpad=30, rotation=0, va='center')
if i1 == len(column_names) - 1:
ax.set_xlabel(features_format[i2], labelpad=10)
ax.scatter(data_no_nan[feat1], data_no_nan[feat2], s=0.5, c='black')
The strong correlation between certain features confirms our assumption that the SDGs and their indicators are strongly correlated and should not be tackled individually.
Unfortunately, we cannot perform a temporal analysis, as the data for each country was surveyed in exactly one year. Thus, a temporal analysis would not give us representative information on temporal development.
Satellite image data¶
The satellite image data consists of 117'644 images of shape $8 \times 255 \times 255$, stored as numpy arrays, one per row of the DHS dataset. This dataset is large, so I will refrain from applying augmentations to artificially increase its size. The first seven channels are "RED, GREEN, BLUE, NIR (Near Infrared), SWIR1 (Shortwave Infrared 1), SWIR2 (Shortwave Infrared 2), and TEMP1 (Thermal)". The eighth channel is a lower-resolution image which captures the spatial distribution of nighttime lighting. The first three channels thus form a standard RGB image; the fourth, fifth and sixth channels measure different infrared wavelengths (which relate, among other things, to vegetation and surface moisture); the seventh channel captures thermal radiation; and the eighth channel shows where lighting is visible at night. We can visualise a randomly selected image as follows:
import numpy as np
import matplotlib.pyplot as plt
# Load one of the images
full_image = np.load('dhs/satellite/AL-2008-5#/AL-2008-5#-00000433.npz')['x']
# Create a figure with subplots in a 2x3 grid
fig, axs = plt.subplots(2, 3, figsize=(15, 10)) # Adjust figsize to your needs
# Multiply by 3 to make the image brighter
# RGB representation
axs[0, 0].imshow(full_image[:3].transpose(1, 2, 0) * 3)
axs[0, 0].set_title('RGB representation')
axs[0, 0].axis('off') # Turn off axis
# NIR representation
axs[0, 1].imshow(full_image[3])
axs[0, 1].set_title('NIR representation')
axs[0, 1].axis('off')
# SWIR1 representation
axs[0, 2].imshow(full_image[4])
axs[0, 2].set_title('SWIR1 representation')
axs[0, 2].axis('off')
# SWIR2 representation
axs[1, 0].imshow(full_image[5])
axs[1, 0].set_title('SWIR2 representation')
axs[1, 0].axis('off')
# TEMP1 representation
axs[1, 1].imshow(full_image[6])
axs[1, 1].set_title('TEMP1 representation')
axs[1, 1].axis('off')
# Nightlight representation
axs[1, 2].imshow(full_image[7])
axs[1, 2].set_title('Nightlight representation')
axs[1, 2].axis('off')
plt.tight_layout()
plt.show()
Street images¶
The street-level images consist of 1'073'455 .jpeg images of various sizes with three RGB channels. The mapping from street-level images to DHS data points (each of which has exactly one satellite image) is surjective, with at least one, but usually multiple, street-level images per data point. The images were sourced from Mapillary, a service for sharing geotagged images. We can sample some of the images and visualise them to build intuition. As with the satellite data, the number of images is large, so I will not use data augmentation to artificially increase the size of the dataset.
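To get a feel for how many street-level images map to each DHS location, we can count the images per DHSID_EA in the Mapillary metadata (a small sketch; it assumes the directory dhs/mapillary/metadata only contains the metadata csv files, as in the preprocessing code further below):
import os
import pandas as pd

# Number of street-level images per DHS location
meta_dir = 'dhs/mapillary/metadata'
metadata = pd.concat([pd.read_csv(os.path.join(meta_dir, f)) for f in os.listdir(meta_dir)], ignore_index=True)
print(metadata.groupby('DHSID_EA').size().describe())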
import matplotlib.image as mpimg
# Create a figure with subplots in a 1x3 grid
fig, axs = plt.subplots(1, 3, figsize=(20, 10))
# Some random image paths
imgs = ['dhs/mapillary/imagery/CO/CO-2010-6#-00001349/1972919636196838.jpeg', 'dhs/mapillary/imagery/MM/MM-2016-7#-00000019/1244428652656362.jpeg', 'dhs/mapillary/imagery/RW/RW-2015-7#-00000028/960400064499984.jpeg']
# Display the image in each subplot
for i, ax in enumerate(axs):
img = mpimg.imread(imgs[i])
ax.imshow(img)
ax.axis('off')
plt.show()
Task and evaluation¶
Plan of action¶
The task of predicting features given other features would not be particularly challenging due to the high degree of correlation between most features. I want to perform the more challenging task of using street-level and satellite images to predict the features.
SustainBench provides a baseline model which predicts the six features given the mean nightlight value of each satellite image. I want to extend this and use both the satellite images and the street-level images to predict the features, with the goal of outperforming the baseline model. To do this, I am going to make use of transfer learning on a pre-trained ResNet34, a residual convolutional neural network. A residual neural network is a type of neural network in which certain connections skip layers in order to mitigate the vanishing gradient problem; this allows for deeper networks than non-residual architectures. Transfer learning, i.e., fine-tuning a model pre-trained on a large image dataset, should allow for faster convergence. ResNet34 has 34 layers and has been shown to perform well on image tasks, so I assume it can also be used for the regression task of predicting the feature values.
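To make the idea of skip connections concrete, a minimal residual block could look as follows. This is only an illustrative sketch, not the exact basic block used inside torchvision's ResNet34:
import torch
import torch.nn as nn

class BasicResidualBlock(nn.Module):
    """Minimal residual block: the input is added back onto the output of two conv layers."""
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        out = torch.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # The skip connection: the gradient can flow directly through the addition,
        # which mitigates the vanishing gradient problem in deep networks
        return torch.relu(out + x)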
I will train three ResNet34 networks with three different inputs:
- The satellite images as inputs (8 channels).
- The mapillary street-level images (3 channels).
- Both (8 + 3 = 11 channels).
These will be compared with regard to their training, validation and test losses. I will then compute the squared Pearson correlation coefficient of the predictions on the test dataset and compare it to the results of the baseline model. I will also compare the models' mean squared error losses to those of random predictions. If a model does not outperform random guessing, it is useless and will not be considered further.
Finally, I want to visualise the features which the network has learned. Deep learning is a useful subset of machine learning for many reasons, one of them being that it performs feature extraction automatically. We are, at the end of the day, interested in the type of features the model learns to base its predictions on. This may allow for a deeper understanding of the correlation between visual and socio-economic features, but could also be useful for interpreting how the model makes its predictions, allowing for more transparent AI.
In an applied setting, the idea behind the models that use street images is to employ them as part of a meta-learning approach: each location can have $n > 1$ street images, so we would feed each of the $n$ inputs for a location into the model and use the mean of all predictions as the final prediction. When using only the satellite images, the mapping between images and labels is bijective, so this aggregation is not needed.
Hardware¶
A model to be used in an applied setting should not only perform well, but also be deployable on feasible hardware. This model will be trained on the following consumer-grade hardware, implying the possibility of relatively inexpensive (< £2500) and uncomplicated deployment:
- Intel i7 13700K CPU
- NVIDIA RTX 4080 GPU (16GB VRAM)
- 32GB RAM with 32GB swap
- 2TB M.2 SSD
Design and build an ML system¶
I describe and justify my choice of model in the previous section. I will start this section by preprocessing the data.
import os
import copy
from PIL import Image
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, Dataset
import torchvision.models as models
from torchvision.models import ResNet34_Weights
Preprocess data¶
I need to preprocess the data for several reasons. Firstly, to get rid of NaN values: I drop all rows which contain at least one NaN value, as we want to avoid NaN values being propagated back through the network. I can do this because the dataset is very large and remains large (> 60'000 rows) after removing those rows. Secondly, to join the label data with the image metadata, which significantly speeds up data loading later on. I split the dataset into mutually exclusive test and training/validation sets as part of the preprocessing, so as to guarantee that they do not leak into each other. Each processed, valid data point is put into the test set with probability $p = 0.15$ and into the training/validation set with probability $1 - p = 0.85$. I do not execute these functions within this notebook, as they take a very long time to complete due to the large amount of data; instead, I executed them as detached processes overnight.
column_names = ['water_index', 'sanitation_index', 'under5_mort', 'asset_index', 'women_edu', 'women_bmi']
test_prob = 0.15
def preprocess_satellite_images():
data = pd.read_csv('dhs_final_labels.csv')[column_names + ['DHSID_EA']]
available_ids = set(data.dropna()['DHSID_EA'])  # use a set for fast membership tests
root_path = 'dhs/satellite'
drop_indices = []
image_id_path_pairs = []
for i, dir in enumerate(os.listdir(root_path)):
if not os.path.isdir(os.path.join(root_path, dir)):
continue
for image_file_name in os.listdir(os.path.join(root_path, dir)):
# First entry of the pair is the name of the image as it appears in the csv
# The second entry is the relative path to the file from the base directory
image_id_path_pairs.append(
(os.path.splitext(image_file_name)[0], os.path.join(root_path, dir, image_file_name)))
print('start iteration')
for i, (id, path) in enumerate(image_id_path_pairs):
if id not in available_ids:
drop_indices.append(i)
if i % 10000 == 0:
print(f'{i}/{len(image_id_path_pairs)} rows checked')
print(f'{len(drop_indices)}/{len(image_id_path_pairs)}')
clean_images_test = []
clean_images_train = []
# Split probabilistically
for i in range(len(image_id_path_pairs)):
if i not in drop_indices:
if random.random() < test_prob:
clean_images_test.append(image_id_path_pairs[i])
else:
clean_images_train.append(image_id_path_pairs[i])
# Assert that the sets are mutually exclusive
assert(len(clean_images_test) + len(clean_images_train) == len(image_id_path_pairs) - len(drop_indices))
# Save the preprocessed data as a list of pairs (satellite_image_id (string), path_to_image (string))
np.save('clear_satellite_image_id_path_pairs_test.npy', clean_images_test)
np.save('clear_satellite_image_id_path_pairs_train.npy', clean_images_train)
I convert the metadata csv file containing information on the street images into two csv files (one for training/validation, one for testing) which also contain the corresponding satellite image path and labels; this speeds up data loading later on.
def preprocess_street_images():
available_ids = set(data.dropna()['DHSID_EA'])  # use a set for fast membership tests
root_path = 'dhs/mapillary'
# Combine all the metadata to one single dataframe for easy lookup
metadatas = []
for csv_metadata in os.listdir(os.path.join(root_path, 'metadata')):
metadatas.append(pd.read_csv(os.path.join(root_path, 'metadata', csv_metadata)))
metadata = pd.concat(metadatas, ignore_index=True)
drop_indices = []
# Remove all the metadata which does not correspond to a legal row of DHS data
print('starting first part')
for index, row in metadata.iterrows():
row_id = row['DHSID_EA']
if row_id not in available_ids:
drop_indices.append(index)
if index % 100000 == 0:
print(f'{index}/{len(metadata)} rows checked')
cleaned_metadata = metadata.drop(drop_indices)
print(f'{len(drop_indices)} values dropped')
test_indexes = []
print('finished first part')
# Add the features to the metadata for easier data loading later on and split into train/val and test data
print('starting the second part')
for index, row in cleaned_metadata.iterrows():
if index % 100000 == 0:
print(f'{index}/{len(cleaned_metadata)} rows checked')
row_id = row['DHSID_EA']
folder = row_id[:row_id.rindex('-')]
satellite_image_path = os.path.join('dhs/satellite', folder, row_id+'.npz')
cleaned_metadata.at[index, 'satellite_image_path'] = satellite_image_path
label = []
if len(data[data['DHSID_EA'] == row_id]) == 0:
# Error, shouldn't happen
print('no mapping')
elif len(data[data['DHSID_EA'] == row_id]) == 2:
# Error, shouldn't happen
print('why are there two mappings?')
else:
# Extract the label from the DHS dataset
label = data[data['DHSID_EA'] == row_id].iloc[0][column_names].to_numpy()
# Add label to combined dataset
for column_index, column in enumerate(column_names):
cleaned_metadata.at[index, column] = label[column_index]
# Add to test set with 15% probability
if random.random() < test_prob:
test_indexes.append(index)
print(f'test set size: {len(test_indexes)}')
# Create new dataframes from the evaluated indices
cleaned_metadata_test = cleaned_metadata.loc[test_indexes].copy()
cleaned_metadata_train = cleaned_metadata.drop(test_indexes)
# Assert mutual exclusiveness
assert(len(cleaned_metadata_train) + len(cleaned_metadata_test) == len(cleaned_metadata))
# Convert dataframes to csv files
cleaned_metadata_test.to_csv('metadata_clean_test.csv', index=False)
cleaned_metadata_train.to_csv('metadata_clean_train.csv', index=False)
print('done!')
Defining the model¶
I define the model as a ResNet34, importing its architecture and pretrained weights. I need to adapt the architecture, as the ResNet is designed for classification, yet we need it for a regression task. Also, the different datasets have different numbers of input channels, so the first convolutional layer has to be adapted dynamically. I also define the hooks used for plotting the learned features later.
class ResNetRegressor(nn.Module):
def __init__(self, input_size=8, output_size=6):
super(ResNetRegressor, self).__init__()
# Load a pre-trained ResNet model
self.resnet = models.resnet34(weights=ResNet34_Weights.DEFAULT)
# Modify the first convolution layer to accept a specified number of input channels
self.resnet.conv1 = nn.Conv2d(input_size, 64, kernel_size=3, stride=2, padding=3, bias=False)
# Replace the last fully connected layer for regression of six values (output_size=6)
# ResNet34 uses 512 features before the final FC layer
num_features = self.resnet.fc.in_features
self.resnet.fc = nn.Linear(num_features, output_size)
def forward(self, x):
return self.resnet(x)
# Register the hooks for learned feature visualisation later on
def register_hook_to_last_layer(self):
activations = []
def get_activation(name):
def hook(model, input, output):
activations.append(output.detach())
return hook
# Attach the hook to the last convolutional layer of the last block
self.resnet.layer4[-1].conv2.register_forward_hook(get_activation('last_conv'))
return activations
def register_hook_to_first_layer(self):
activations = []
def get_activation(name):
def hook(model, input, output):
activations.append(output.detach())
return hook
# Attach the hook to the last convolutional layer of the first block
self.resnet.layer1[-1].conv1.register_forward_hook(get_activation('last_conv_layer1'))
return activations
def register_hook_to_second_layer(self):
activations = []
def get_activation(name):
def hook(model, input, output):
activations.append(output.detach())
return hook
# Attach the hook to the last convolutional layer of the second block
self.resnet.layer2[-1].conv2.register_forward_hook(get_activation('last_conv_layer2'))
return activations
Defining the datasets¶
I now define the datasets, which inherit from the PyTorch Dataset class. This format allows for defining data loaders down the line, which provide very convenient data loading with parallel processing, shuffling, etc. I define one dataset per model, thus a total of three datasets, as they need to load different data.
class DHSSatelliteDataset(Dataset):
def __init__(self, root_path, csv_file, column_names, transform=None, test=False):
"""
Args:
root_path (string): Path to the satellite image data.
csv_file (string): Path to the dhs data csv file.
column_names (list): List of relevant columns in the dhs dataset.
transform (callable, optional): Optional transform to be applied on a sample.
test (bool, optional): True if this is a test set.
"""
self.column_names = column_names
self.data = pd.read_csv(csv_file)[self.column_names + ['DHSID_EA']]
self.transform = transform
self.root_path = root_path
# Load test or train/val data depending on the boolean value test
if test:
self.image_id_path_pairs = np.load('clear_satellite_image_id_path_pairs_test.npy')
else:
self.image_id_path_pairs = np.load('clear_satellite_image_id_path_pairs_train.npy')
def __len__(self):
# The length is simply the number of images
return len(self.image_id_path_pairs)
def __getitem__(self, idx):
image_id, image_path = self.image_id_path_pairs[idx]
satellite_image = np.load(image_path)['x']
# The eight channels correspond to different measurements performed by the satellite
assert(satellite_image.shape == (8, 255, 255))
label = -1
# This is really just for debugging
if len(self.data[self.data['DHSID_EA'] == image_id]) == 0:
print('no mapping')
return
elif len(self.data[self.data['DHSID_EA'] == image_id]) == 2:
print('why are there two mappings?')
return
else:
label = self.data[self.data['DHSID_EA'] == image_id].iloc[0][self.column_names].to_numpy()
if self.transform:
satellite_image = self.transform(satellite_image)
new_label = []
# This is a bodge, but it works
for l in label:
new_label.append(l)
return satellite_image.permute(1, 0, 2).float(), torch.tensor(new_label).float()
class DHSCombinedDataset(Dataset):
def __init__(self, root_path, csv_file, satellite_transform=None, street_transform=None, test=False):
"""
Args:
root_path (string): Path to the satellite image data.
csv_file (string): Path to the dhs data csv file.
satellite_transform (callable, optional): Optional transform to be applied on the satellite data.
street_transform (callable, optional): Optional transform to be applied on the street data.
test (bool, optional): True if this is a test set.
"""
self.data = pd.read_csv(csv_file)[column_names + ['DHSID_EA', 'cname']]
self.data = self.data.dropna()
self.satellite_transform = satellite_transform
self.street_transform = street_transform
self.root_path = root_path
if test:
self.metadata = pd.read_csv('metadata_clean_test.csv')
else:
self.metadata = pd.read_csv('metadata_clean_train.csv')
def __len__(self):
# The length is simply the number of images
return len(self.metadata)
def __getitem__(self, idx):
metadata_row = self.metadata.iloc[idx]
street_image = Image.open(os.path.join(self.root_path, 'imagery', metadata_row['img_path']))
satellite_image = np.load(metadata_row['satellite_image_path'])['x']
label = metadata_row[column_names].to_numpy()
if self.street_transform:
street_image_transformed = self.street_transform(street_image)
if self.satellite_transform:
satellite_image_transformed = self.satellite_transform(satellite_image).permute(1, 0, 2)
final_image = torch.cat((street_image_transformed, satellite_image_transformed), dim=0)
new_label = []
for l in label:
new_label.append(l)
new_tensor_label = torch.tensor(new_label)
return final_image.float(), new_tensor_label.float()
class DHSStreetDataset(Dataset):
def __init__(self, root_path, csv_file, transform=None, test=False):
"""
Args:
root_path (string): Path to the satellite image data.
csv_file (string): Path to the dhs data csv file.
transform (callable, optional): Optional transform to be applied on the street image.
test (bool, optional): True if this is a test set.
"""
self.data = pd.read_csv(csv_file)[column_names + ['DHSID_EA', 'cname']]
self.data = self.data.dropna()
self.transform = transform
self.root_path = root_path
if test:
self.metadata = pd.read_csv('metadata_clean_test.csv')
else:
self.metadata = pd.read_csv('metadata_clean_train.csv')
def __len__(self):
# The length is simply the number of images
return len(self.metadata)
def __getitem__(self, idx):
metadata_row = self.metadata.iloc[idx]
image = Image.open(os.path.join(self.root_path, 'imagery', metadata_row['img_path']))
id = metadata_row['DHSID_EA']
if len(self.data[self.data['DHSID_EA'] == id]) == 0:
print('no mapping')
elif len(self.data[self.data['DHSID_EA'] == id]) == 2:
print('why are there two mappings?')
else:
label = self.data[self.data['DHSID_EA'] == id].iloc[0][column_names].to_numpy()
if self.transform:
image = self.transform(image)
new_label = []
for l in label:
new_label.append(l)
new_tensor_label = torch.tensor(new_label)
return image.float(), new_tensor_label.float()
Defining the train function¶
Here I define the hyperparameters as well as the function for training the model. The function loads the training and validation data, creates the data loaders and runs the training loop. The data loader automatically loads the correct batch; each batch is fed through the network, the loss is computed and propagated back through the network in order to tune the weights. The training and validation losses per epoch are tracked in lists which are returned together with the best model. The best model is not necessarily the final model, but rather the model which performs best on the validation data; it is also saved to a file in the models folder.
# Hyperparameters
num_epochs = 50
batch_size = 256
learning_rate = 0.000005
train_split = 0.8  # fraction of the non-test data used for training (the rest is validation)
# Use the GPU if it's available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.MSELoss()
def train_model(model, dataset, name):
training_losses = []
validation_losses = []
# Split the dataset into training, validation, and test sets
train_size = int(train_split * len(dataset))
validation_size = len(dataset) - train_size
train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])
num_workers = 8
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
# Initialize the model, move it to the GPU
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
print(f'Total training data: {len(train_loader.dataset)}')
lowest_val_loss = math.inf
best_model = None
# Main training loop
for epoch in range(num_epochs):
model.train()
total_epoch_training_loss = 0
for inputs, targets in train_loader:
# PyTorch makes training very easy
optimizer.zero_grad()
inputs = inputs.to(device)
targets = targets.to(device)
outputs = model(inputs)
loss = criterion(outputs, targets)
total_epoch_training_loss += loss
loss.backward()
optimizer.step()
normalised_epoch_training_loss = total_epoch_training_loss / len(train_dataset)
training_losses.append(normalised_epoch_training_loss)
# Validation loop
model.eval()
total_epoch_val_loss = 0
with torch.no_grad():
for inputs, targets in validation_loader:
inputs = inputs.to(device)
targets = targets.to(device)
outputs = model(inputs)
total_epoch_val_loss += criterion(outputs, targets)
normalised_epoch_val_loss = total_epoch_val_loss / len(validation_dataset)
validation_losses.append(normalised_epoch_val_loss)
# If the loss on the validation data is lower than the previously lowest loss, a new
# best model has been found
if total_epoch_val_loss < lowest_val_loss:
lowest_val_loss = total_epoch_val_loss
torch.save(model.state_dict(), f'models/{name}_best.pth')
best_model = copy.deepcopy(model)  # deep copy so later epochs do not overwrite the best weights
# Print the normalised training and validation losses for this epoch
print(f'Epoch {epoch+1}/{num_epochs}, Normalised Training Loss: {normalised_epoch_training_loss:.2f} Normalised Validation Loss: {normalised_epoch_val_loss:.2f}')
return best_model, training_losses, validation_losses
Defining the transforms, models and datasets¶
The transforms take the raw inputs and convert them to the format the model requires. These transforms differ for the street and satellite images due to their differing sizes. Both end up as 255x255 normalised tensors. I then define the models and datasets.
# To make this reproducible
torch.manual_seed(42)
street_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Resize((255, 255), antialias=False),
transforms.Normalize((0.5,), (0.5,)),
])
satellite_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,)) # Normalize the images
])
satellite_dataset = DHSSatelliteDataset('dhs/satellite', 'dhs_final_labels.csv', column_names, satellite_transform, test=False)
satellite_model = ResNetRegressor(input_size=8)
street_dataset = DHSStreetDataset('dhs/mapillary', 'dhs_final_labels.csv', street_transform, test=False)
street_model = ResNetRegressor(input_size=3)
combined_dataset = DHSCombinedDataset('dhs/mapillary', 'dhs_final_labels.csv', satellite_transform=satellite_transform, street_transform=street_transform, test=False)
combined_model = ResNetRegressor(input_size=11)
Training the model¶
I am not actually training the models in this notebook, as Jupyter is finicky and training 50 epochs takes up to a day. Instead, I trained the models as a script within a detached process on my desktop computer. The next cell would execute the training, but I have commented out nearly the entire cell so I can run the notebook without having to wait more than a day for it to finish. I save the losses to pickle files to import again for the evaluation.
import pickle
#satellite_model, satellite_train_losses, satellite_val_losses = train_model(satellite_model, satellite_dataset, 'satellite')
#with open('satellite_train_losses.pkl', 'wb') as file:
# pickle.dump(satellite_train_losses, file)
#with open('satellite_val_losses.pkl', 'wb') as file:
# pickle.dump(satellite_val_losses, file)
#street_model, street_train_losses, street_val_losses = train_model(street_model, street_dataset, 'street')
#with open('street_train_losses.pkl', 'wb') as file:
# pickle.dump(street_train_losses, file)
#with open('street_val_losses.pkl', 'wb') as file:
# pickle.dump(street_val_losses, file)
#combined_model, combined_train_losses, combined_val_losses = train_model(combined_model, combined_dataset, 'combined')
#with open('combined_train_losses.pkl', 'wb') as file:
# pickle.dump(combined_train_losses, file)
#with open('combined_val_losses.pkl', 'wb') as file:
# pickle.dump(combined_val_losses, file)
Experimental analysis (performance & scalability)¶
Tuning the hyperparameters¶
I experimented with different models in a PyCharm environment, which is a lot easier to work with than a Jupyter notebook. I tested various CNN architectures and established that ResNet34 performs best given the limitations of the RTX 4080 GPU I run it on. A larger ResNet may further increase performance. The other hyperparameters I found to perform well were
- The Adam optimiser, which is a popular choice. It uses adaptive learning rates and includes momentum, i.e., information on past gradients, which helps navigate noisy gradients.
- A learning rate of 0.000005, which is surprisingly small.
- A batch size of 256, which I would have preferred to be higher, yet 16GB of VRAM did not allow for more. A larger batch size gives more accurate estimates of the gradient, yet can lead to overfitting on small datasets; our datasets are very large, so a larger batch size would have been desirable.
- The mean squared error as a loss function. The MSE is a good choice for regression tasks and due to the errors being squared, it penalises outliers more strongly. I also tested the mean absolute error but it did not perform as well.
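As a small illustration of the last point (the numbers are made up and not from the experiments): a single large prediction error dominates the MSE far more than the MAE.
import torch
import torch.nn as nn

# One outlier prediction (10 instead of 4); the other predictions are exact
pred = torch.tensor([1.0, 2.0, 3.0, 10.0])
target = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(nn.MSELoss()(pred, target))  # mean squared error: (0 + 0 + 0 + 36) / 4 = 9.0
print(nn.L1Loss()(pred, target))   # mean absolute error: (0 + 0 + 0 + 6) / 4 = 1.5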
Plotting the loss over the test and validation data¶
First of all, I will plot the training and validation loss over the training epochs. The training and validation data are mutually exclusive, randomly split into 80% training and 20% validation of the non-test data. We expect the training loss to shrink consistently, as the model should fit the training data increasingly well with each epoch. Plotting the training loss is therefore more of a sanity check than anything else, as only the loss on the validation data, which the model has not trained on, indicates performance on new data. We want a model which performs maximally well on new data, regardless of how well it performs on the training data: if the training loss is low, yet the validation loss is high, we have overfit the model. In the next cell, I plot the training and validation losses of each model over the training epochs, which provides an intuitive visualisation of the training process and the potential quality of each model.
# Load the best models from the file saved during training
satellite_model.load_state_dict(torch.load('models/satellite_best.pth'))
street_model.load_state_dict(torch.load('models/street_best.pth'))
combined_model.load_state_dict(torch.load('models/combined_best.pth'))
# Load the losses for plotting
with open('satellite_train_losses.pkl', 'rb') as file:
satellite_train_losses = [x.item() for x in pickle.load(file)]
with open('satellite_val_losses.pkl', 'rb') as file:
satellite_validation_losses = [x.item() for x in pickle.load(file)]
with open('street_train_losses.pkl', 'rb') as file:
street_train_losses = [x.item() for x in pickle.load(file)]
with open('street_val_losses.pkl', 'rb') as file:
street_validation_losses = [x.item() for x in pickle.load(file)]
with open('combined_train_losses.pkl', 'rb') as file:
combined_train_losses = [x.item() for x in pickle.load(file)]
with open('combined_val_losses.pkl', 'rb') as file:
combined_validation_losses = [x.item() for x in pickle.load(file)]
# Plotting
plt.figure(figsize=(12, 7))
plt.plot(list(range(0, len(satellite_train_losses))), satellite_train_losses, color='b', linestyle='--', label='Satellite Train')
plt.plot(list(range(0, len(satellite_validation_losses))), satellite_validation_losses, color='b', label='Satellite Validation')
plt.plot(list(range(0, len(street_train_losses))), street_train_losses, color='r', linestyle='--', label='Street Train')
plt.plot(list(range(0, len(street_validation_losses))), street_validation_losses, color='r', label='Street Validation')
plt.plot(list(range(0, len(combined_train_losses))), combined_train_losses, color='g', linestyle='--', label='Satellite + Street Train')
plt.plot(list(range(0, len(combined_validation_losses))), combined_validation_losses, color='g', label='Satellite + Street Validation')
plt.title('Normalised Loss per Epoch')
plt.xlabel('Epoch')
plt.ylabel('Normalised Loss')
plt.grid(True)
plt.legend()
plt.show()
We observe the following:
- The training error decreases consistently for all three models, thus we passed the sanity check.
- The model using both the satellite and the street images outperforms the other two models on the validation data by a significant margin.
- The model trained on the street images starts overfitting the training data after about 10 epochs: thereafter the training loss keeps decreasing, yet the validation loss starts rising again.
This is a satisfying result and it seems like the combined model has managed to learn to fit the training data while still managing to generalise on the validation data.
Performance on the test data¶
Reasoning about the quality of a model is incomplete until we have evaluated its performance on the test data. The validation loss gives us information on how well the model generalises, yet I have used the validation data to tune the hyperparameters; thus, there is a possibility that I overfit the model to the validation data. The test data is data on which neither I nor the model has any information. Thus, if the model performs well on the test data, we can infer that it will perform well in practice.
The evaluation metrics are, at first, the mean squared error loss and the squared Pearson correlation coefficient. Both metrics indicate how the predictions relate to the ground truth. The squared Pearson correlation coefficient, already described in the exploratory data analysis, measures the degree of correlation (non-negative, as it is squared) between the predictions and the ground truth test data. A high correlation coefficient is desirable.
from scipy.stats import pearsonr
print(f'device: {device}')
satellite_dataset_test = DHSSatelliteDataset('dhs/satellite', 'dhs_final_labels.csv', column_names, satellite_transform, test=True)
street_dataset_test = DHSStreetDataset('dhs/mapillary', 'dhs_final_labels.csv', street_transform, test=True)
combined_dataset_test = DHSCombinedDataset('dhs/mapillary', 'dhs_final_labels.csv', satellite_transform=satellite_transform, street_transform=street_transform, test=True)
# Test each of the three models
for name, test_dataset, model in [('satellite', satellite_dataset_test, satellite_model), ('street', street_dataset_test, street_model), ('combined', combined_dataset_test, combined_model)]:
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=6)
model.eval()
model.to(device)
total_test_loss = 0
predictions = []
actuals = []
# We only need to compute the gradient when training
# When testing, we can disable the gradient which saves time and computational resources
with torch.no_grad():
for inputs, targets in test_loader:
inputs = inputs.to(device)
targets = targets.to(device)
outputs = model(inputs)
total_test_loss += criterion(outputs, targets)
predictions.extend(outputs.cpu().numpy())
actuals.extend(targets.cpu().numpy())
# Compute Pearson correlation coefficient for each feature
for i in range(6):
corr, _ = pearsonr([p[i] for p in predictions], [a[i] for a in actuals])
print(f'Squared Pearson correlation coefficient of {name} model for {column_names[i]}: r^2 = {round(corr**2, 5)}')
print(f'\n{name} normalised test loss: {round(total_test_loss.detach().item() / len(test_dataset), 5)}\n')
device: cuda
Squared Pearson correlation coefficient of satellite model for water_index: r^2 = 0.18964
Squared Pearson correlation coefficient of satellite model for sanitation_index: r^2 = 0.35702
Squared Pearson correlation coefficient of satellite model for under5_mort: r^2 = 0.112
Squared Pearson correlation coefficient of satellite model for asset_index: r^2 = 0.36497
Squared Pearson correlation coefficient of satellite model for women_edu: r^2 = 0.26019
Squared Pearson correlation coefficient of satellite model for women_bmi: r^2 = 0.02987

satellite normalised test loss: 0.74031

Squared Pearson correlation coefficient of street model for water_index: r^2 = 0.2137
Squared Pearson correlation coefficient of street model for sanitation_index: r^2 = 0.26379
Squared Pearson correlation coefficient of street model for under5_mort: r^2 = 0.10356
Squared Pearson correlation coefficient of street model for asset_index: r^2 = 0.31326
Squared Pearson correlation coefficient of street model for women_edu: r^2 = 0.382
Squared Pearson correlation coefficient of street model for women_bmi: r^2 = 1e-05

street normalised test loss: 0.44135

Squared Pearson correlation coefficient of combined model for water_index: r^2 = 0.81794
Squared Pearson correlation coefficient of combined model for sanitation_index: r^2 = 0.85588
Squared Pearson correlation coefficient of combined model for under5_mort: r^2 = 0.99724
Squared Pearson correlation coefficient of combined model for asset_index: r^2 = 0.95624
Squared Pearson correlation coefficient of combined model for women_edu: r^2 = 0.97702
Squared Pearson correlation coefficient of combined model for women_bmi: r^2 = 0.95991

combined normalised test loss: 0.00162
We immediately observe the very high correlation of the combined model's predictions. I did not expect the results to be this good; when I first encountered them, I suspected they were too good, suggesting a data leak. I therefore re-implemented my datasets so as to create two completely separate lists of cleaned and preprocessed data, one for training and validation, the other for testing, such that the data is guaranteed to be separate. After re-training, the results remained as good as before.
The other two models on their own are unable to produce strong correlations. We can observe that, as with the validation data, the combined model significantly outperforms the other two with regard to the test loss.
The baseline model developed by the SustainBench authors achieves the following squared Pearson correlation coefficients between the predictions and the true data:
- Water Quality Index: $r^2 = 0.4$
- Sanitation Index: $r^2 = 0.36$
- Child Mortality: $r^2 = 0.01$
- Asset Index: $r^2 = 0.63$
- Women's Education: $r^2 = 0.26$
- Women's BMI: $r^2 = 0.42$
The combined model thus seems to wildly outperform the baseline model. The baseline model mostly outperforms the satellite-only and street-only models, the exceptions being the child mortality rate, for which all three of my models are superior, and women's educational attainment, where the street model also beats the baseline.
Comparison to random predictions¶
We will now only consider the combined model, as it is clearly the superior one. So far, we have only evaluated the relative performance of the models compared to each other. If the combined model does not outperform random guesses, it is useless. Thus, I will compare the test loss of the combined model to the loss obtained when making random guesses.
mean = data_no_nan[column_names].mean()
total_benchmark_loss = 0
random_benchmark_loss = 0
test_loader = DataLoader(combined_dataset_test, batch_size=512, shuffle=False, num_workers=4)
random_predictions = []
actuals = []
for inputs, targets in test_loader:
random_guess = torch.tensor((np.random.rand(targets.shape[0], 6)-0.5)*10)
random_benchmark_loss += criterion(random_guess, targets)
random_predictions.extend(random_guess.numpy())
actuals.extend(targets.numpy())
# Compute Pearson correlation coefficient for each feature
for i in range(6):
corr, _ = pearsonr([p[i] for p in random_predictions], [a[i] for a in actuals])
print(f'Squared Pearson correlation coefficient of random benchmark for {column_names[i]}: r^2 = {corr**2}')
print(f'Random benchmark loss: {random_benchmark_loss / len(combined_dataset_test)}')
Squared Pearson correlation coefficient of random benchmark for water_index: r^2 = 4.1952912785140774e-05
Squared Pearson correlation coefficient of random benchmark for sanitation_index: r^2 = 9.294217353504598e-09
Squared Pearson correlation coefficient of random benchmark for under5_mort: r^2 = 7.390261732725432e-06
Squared Pearson correlation coefficient of random benchmark for asset_index: r^2 = 4.040565161363265e-05
Squared Pearson correlation coefficient of random benchmark for women_edu: r^2 = 8.574681306376353e-06
Squared Pearson correlation coefficient of random benchmark for women_bmi: r^2 = 2.3161961569539365e-06
Random benchmark loss: 0.7936563413002258
As expected, the random benchmark does not produce predictions with any significant correlation to the ground truth. We can observe that the combined model outperforms random guessing by a significant margin with regard to both the test loss and the correlation. Thus, we have established that the combined model makes more meaningful predictions than random guessing or the SustainBench baseline model.
Performing inference¶
Having concluded that the combined model outperforms the benchmarks, we can use it to infer some data. In practice, we would use a meta-learning approach, whereby we would predict the features given all the street-level images of a location and then use the mean of those predictions as the final prediction. For the purpose of demonstration, I only generate a prediction for a single street image instead of simulating the meta-learner.
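In such a deployment, the per-location aggregation could look roughly like the following sketch. The helper predict_location is hypothetical and not part of the trained pipeline; it assumes a list of preprocessed 11-channel input tensors belonging to one location:
def predict_location(model, location_inputs):
    """Average the model's predictions over all images belonging to one DHS location.

    location_inputs: list of preprocessed 11-channel input tensors for the same location.
    """
    model.eval()
    with torch.no_grad():
        predictions = [model(torch.unsqueeze(x, dim=0)) for x in location_inputs]
    # Each prediction has shape (1, 6); average them into a single 6-dimensional prediction
    return torch.mean(torch.cat(predictions, dim=0), dim=0)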
# Switch to evaluation mode and move the model to the CPU; a GPU is only needed when we want to exploit its parallelism.
combined_model.eval()
combined_model.to('cpu')
# The gradient is only needed during training.
# Disabling it during inference saves time and computational resources.
with torch.no_grad():
    datapoint = combined_dataset_test[233]
    image = datapoint[0]
    fig, axs = plt.subplots(1, 2, figsize=(20, 10))
    axs[0].set_title('Street input as RGB image')
    # Scale the RGB channels by 2 to brighten the plot; values above 1 are clipped (hence the warning below)
    axs[0].imshow(image[:3].permute(1, 2, 0) * 2)
    axs[1].set_title('Satellite input as THERM1 representation')
    axs[1].imshow(image[9])
    # Add a batch dimension and predict the six features for this datapoint
    some_input = torch.unsqueeze(image, dim=0)
    outputs = list(combined_model(some_input)[0])
    predictions = list(zip(column_names, outputs))
    for i in range(6):
        print(f'Predicted {predictions[i][0]}: {predictions[i][1].item()}, ground truth {datapoint[1][i]}')
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
Predicted water_index: 4.3228020668029785, ground truth 4.0
Predicted sanitation_index: 2.4591689109802246, ground truth 2.75
Predicted under5_mort: -0.2389991134405136, ground truth 0.0
Predicted asset_index: -0.1608094573020935, ground truth -0.31347164511680603
Predicted women_edu: 6.261411666870117, ground truth 8.875
Predicted women_bmi: 26.67802619934082, ground truth 24.003999710083008
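For reference, here is a minimal sketch of the per-location aggregation described above: the model predicts the features once per street-level input of a location, and the mean of those predictions is the final prediction. The helper and the list `inputs_for_location` are hypothetical and only illustrate the idea; they are not part of the trained pipeline.

def predict_location(model, street_inputs):
    """Average the model's predictions over all street-level inputs of one location (sketch)."""
    model.eval()
    with torch.no_grad():
        # One forward pass per street-level image of the location
        per_image_preds = [model(torch.unsqueeze(x, dim=0))[0] for x in street_inputs]
    # The final prediction for the location is the mean of the per-image predictions
    return torch.stack(per_image_preds).mean(dim=0)

# Hypothetical usage, assuming a list of combined input tensors for one location:
# location_prediction = predict_location(combined_model, inputs_for_location)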
Visualising the learned features¶
It could be interesting to observe how the models make their predictions. The following cell defines a function which visualises the features the network has learned at different depths, using so-called hooks. Hooks allow us to access the intermediate activations within the network.
import math

# Get the intermediate activations by registering the defined hooks during the forward pass,
# then plot them as grids of feature maps
def plot_feature_maps(activations, title, n_maps):
    # Arrange the first grid_size^2 feature maps in a square grid
    grid_size = math.floor(math.sqrt(n_maps))
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(20, 20))
    fig.suptitle(title)
    for i, ax in enumerate(axes.flat):
        # Remove the batch dimension and select the i-th channel
        feature_map = activations[0].squeeze(0)[i]
        # Normalise the feature map for visualisation
        feature_map = feature_map - feature_map.min()
        feature_map = feature_map / feature_map.max()
        ax.imshow(feature_map, cmap='gray')
        ax.axis('off')
    plt.show()
    print('\n')

def plot_feature_extraction(model, dataset, name):
    print(f'Extracted features for the {name} model:')
    model = model.to('cpu')
    activations1 = model.register_hook_to_first_layer()
    activations2 = model.register_hook_to_second_layer()
    activations4 = model.register_hook_to_last_layer()
    # A single forward pass on the CPU fills the registered hooks with activations
    output = model(torch.unsqueeze(dataset[0][0], dim=0))
    plot_feature_maps(activations1, 'Learned features after first block', 64)
    plot_feature_maps(activations2, 'Learned features after second block', 64)
    plot_feature_maps(activations4, 'Learned features after last block', 512)
plot_feature_extraction(combined_model, combined_dataset, 'combined')
Extracted features for the combined model:
The three grids of images visualise how the ResNet34 learns increasingly abstract representations of the data at increasing depth in order to make its predictions. These results are hard to interpret: we cannot make out anything meaningful beyond vague textures, with a somewhat rougher texture in the top-left corner. A likely reason is that the 11 input channels do not form a coherent visual image, i.e., there is no sensible way of plotting them together in 2D, which would explain why the learned features are not clearly interpretable.
Plotting the same for the street model adds little analytical value, but it is still interesting, as one can observe how the network represents a street at different levels of abstraction.
plot_feature_extraction(street_model, street_dataset, 'street')
Extracted features for the street model:
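As a side note, the `register_hook_to_first_layer`, `register_hook_to_second_layer` and `register_hook_to_last_layer` methods used above are custom helpers defined earlier in this notebook. For readers unfamiliar with hooks, the following is a rough sketch of how such a helper can be built with PyTorch's `register_forward_hook`, shown here on a plain torchvision ResNet34 rather than on my actual model classes:

import torch
import torchvision

def register_activation_hook(layer):
    """Return a list that will be filled with the layer's output on the next forward pass (sketch)."""
    activations = []
    def hook(module, inputs, output):
        # Store a detached copy of the layer's output so it can be plotted later
        activations.append(output.detach())
    layer.register_forward_hook(hook)
    return activations

# Hypothetical usage on a plain torchvision ResNet34:
resnet = torchvision.models.resnet34(weights=None)
activations1 = register_activation_hook(resnet.layer1)
_ = resnet(torch.randn(1, 3, 224, 224))  # the forward pass fills activations1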
Ethical considerations¶
Interpretability¶
As observed in the previous section, the most successful model does not learn any visually interpretable features. This has two consequences. Firstly, we cannot use this model to learn which visual features are useful for predicting the socio-economic features. Secondly, using the model to make decisions could be problematic: relying on predictions without knowing how they were made leaves users exposed to whatever biases and errors the model has learned. Without being able to interpret how the model makes its predictions, it is hard to mitigate these biases, and we would have to assume that the data is unbiased, which is an assumption we cannot make.
Biased predictions¶
Biases are undoubtedly present in the dataset. It consists of data collected by the United States Agency for International Development, a government institution in the Global North, describing phenomena in the Global South. This uni-directional approach creates biased data, as the people designing the surveys invariably incorporate their subjective perspective. Positionality influencing design is inherent to humans and thus not a bad thing in and of itself, but if there is no diversity among the actors involved in designing the surveys, the resulting information will be biased. During my research, I could not find any information on how the DHS Program attempts to mitigate these biases.
In addition, it is questionable whether we in the Global North should be collecting data on the Global South in an ostensibly altruistic and arguably patronising manner. The same concern extends to the development of this model. For the sake of argument, let us assume that this model would be used by local agents in the countries present in the DHS dataset to make informed policy decisions with the SDGs in mind. Even then, using the model in such a way would be ethically questionable due to its uni-directional development. In order to develop a model which could be deployed in the affected countries in an ethical way, not only the data collection, but also the development of the model should involve many different actors from the targeted countries. The focus should shift from model accuracy towards what a representative sample of affected people need, what they can work with, and how we can support the SDGs with approaches and tools developed at least in part by them. We cannot achieve the SDGs by providing affected people with technology they never asked for.
Gerrymandering¶
Gerrymandering is a political strategy in which electoral boundaries, i.e., the boundaries of a politically relevant area, are drawn deliberately to achieve a certain outcome. It follows that aggregating geographical data over discrete areas, be it votes or socio-economic factors, is not objective: the result inherently depends on how those areas are defined. In geography-based data like ours, where each datapoint denotes one geographical location, the boundaries of the locations can be drawn arbitrarily, which influences the results. This is a factor to consider when thinking about biases within the data.
Uncertainty quantification¶
Machine learning models occasionally output nonsense, even if such occasions are rare. A rare occasion in which our model misjudges a piece of information (such as child mortality) could lead to serious and unwanted consequences in practice, for example the unnecessary allocation of scarce resources or the false confidence that a problem (such as high child mortality in an area) has been solved. A future model could incorporate Bayesian uncertainty quantification, which would allow for evaluating how confident the predictions are. Quoting Hein et al. (2019), machine learning models should "know when they don't know", allowing the user to avoid false confidence.
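One lightweight way of approximating Bayesian uncertainty quantification is Monte Carlo dropout: keeping the dropout layers active at inference time, repeating the forward pass several times and reading the spread of the predictions as an uncertainty estimate. The following is a minimal sketch of this idea, not part of the models trained above; it assumes the model contains dropout layers, which the ResNet34 backbone used here does not have by default.

import torch
import torch.nn as nn

def mc_dropout_predict(model, x, n_samples=50):
    """Monte Carlo dropout (sketch): repeat the forward pass with dropout enabled and
    return the mean prediction together with its standard deviation."""
    model.eval()
    # Re-enable only the dropout layers, keeping batch norm etc. in eval mode
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.train()
    with torch.no_grad():
        samples = torch.stack([model(x) for _ in range(n_samples)])
    return samples.mean(dim=0), samples.std(dim=0)

# Hypothetical usage: mean_pred, uncertainty = mc_dropout_predict(combined_model, some_input)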
Consequences of predictions based on mere visual observations¶
The use of mere visual observations in a model for predicting highly impactful aspects of human life (e.g. wealth, health, child mortality) is debatable. This method is akin to a human walking through a village and inferring, based on the dirt road, the number of houses and the shape of people's skulls, that around every $k$-th child dies before the age of 5. While potentially accurate and effective, such an approach raises significant ethical concerns, as it oversimplifies complex social phenomena and relies on superficial assessments that may not accurately reflect the underlying realities of these communities.
Sustainable development relevance & impact¶
Relevance of the exploratory data analysis¶
The exploratory data analysis demonstrated the interconnectedness of different SDGs. This notion is of immense importance when thinking about achieving the SDGs: if we want to find sustainable and effective solutions, we cannot look at individual targets only, but must try to understand the complex system of correlated targets. The data analysis additionally allowed us to distinguish between features with high correlation and those without. For example, water quality and access to sanitation are very strongly correlated (thus justifying their combination in SDG 6), whereas water quality and child mortality appear to be essentially uncorrelated, i.e., their covariance is insignificant. Such an approach (which is not machine learning, but closely related) can help us understand the interconnectedness of the targets.
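As a pointer, such a pairwise correlation analysis can be reproduced directly with pandas. The sketch below assumes the `data_no_nan` DataFrame and the `column_names` list defined earlier in the notebook:

# Pairwise Pearson correlations between the six socio-economic features
corr_matrix = data_no_nan[column_names].corr()
print(corr_matrix)

# Visualise the correlation matrix as a heatmap
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
ax.set_xticks(range(len(column_names)))
ax.set_xticklabels(column_names, rotation=45, ha='right')
ax.set_yticks(range(len(column_names)))
ax.set_yticklabels(column_names)
fig.colorbar(im, ax=ax)
plt.show()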
It is important to mention that correlation does not imply causation. This is a fundamental principle of applied statistics and relates to the questionable cause fallacy: the false belief that if A and B occur together (such as a low asset index and low water quality), then A must cause B. E.g., if AIDS and homosexuality are strongly correlated, this does not imply that AIDS causes homosexuality (example borrowed from here). Similarly, we must keep in mind that we cannot directly infer a causal relationship from correlated SDG indicators.
Overall, data analysis is a useful tool as a supplement to a qualitative approach to understanding the complex interconnectedness of the SDGs. It is important to note that the numbers and plots derived as part of the exploratory data analysis have no meaning in and of themselves. They need to be embedded in detailed descriptions of the context within which they are relevant in order to become meaningful and thereby useful.
Relationship of the dataset to the SDGs¶
The indicators are the primary means of measuring progress towards the SDGs, so keeping track of them is of great importance. The DHS dataset contains data which aligns with many different SDGs and can be used to infer their indicators.
Water quality index¶
The water quality index clearly aligns with SDG 6: Clean Water and Sanitation, as it measures how clean and accessible the water is. Detailed information on how it is computed can be found in the SustainBench paper (Yeh et al., 2021). It captures access to clean water and thus aligns with indicator 6.1.1 (proportion of population using safely managed drinking water services).
Sanitation index¶
The sanitation index also aligns with SDG 6: Clean Water and Sanitation. It contains information on the proportion of people who have access to sanitation facilities, as described by indicator 6.2.1 (proportion of population using safely managed sanitation services, including a hand-washing facility with soap and water). Again, the SustainBench paper contains detailed information on how the index is computed.
Child mortality rate¶
The SDG 3: Good Health and Well-being has the indicator 3.2.1 (under-five mortality rate) which corresponds exactly to this feature.
Asset index¶
This feature aligns with SDG 1: No Poverty, as a very low asset index corresponds to poverty, and with SDG 10: Reduced Inequalities, as financial inequality can be quantified by the variance of the asset index. In particular, the asset index contains information on indicator 1.4.1 (proportion of population living in households with access to basic services).
Women's average educational attainment in years¶
This feature does not correspond to any particular indicator, but is closely related to SDG 4: Quality Education and SDG 5: Gender Equality.
Women's average BMI¶
This feature aligns with SDG 3: Good Health and Well-being. An excessively high BMI is correlated with many metabolic and cardiovascular diseases, so indicator 3.4.1 (mortality rate attributed to cardiovascular disease, cancer, diabetes or chronic respiratory disease) is closely related to this feature. An excessively low BMI is associated with chronic hunger, thus relating to SDG 2: Zero Hunger.
Where machine learning can help¶
This data was, and data on indicators generally is, obtained through surveys (hence the S in DHS). Surveys are expensive and slow. A model such as the combined model could benefit the SDGs in three ways:
- Allowing for inference, i.e., surveying only half of the locations and then inferring the data for the others using machine learning, thereby reducing costs (see the sketch after this list).
- Keeping track of the features (asset index, water index, etc.) in a fully automatic way using a machine learning model and up-to-date street and satellite images.
- Analysing the feature extraction of the CNN, observing which features in the images the CNN connects to the survey results.
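As an illustration of the first point, the following is a minimal sketch of how the trained model could fill in feature values for locations that were not surveyed. The `unsurveyed_dataset`, its per-sample location ids and the resulting DataFrame are hypothetical and only serve to illustrate the workflow:

import pandas as pd
from torch.utils.data import DataLoader

# Hypothetical dataset of locations with imagery but without survey labels,
# yielding (input tensor, location id) pairs
unsurveyed_loader = DataLoader(unsurveyed_dataset, batch_size=512, shuffle=False)

combined_model.eval()
rows = []
with torch.no_grad():
    for inputs, location_ids in unsurveyed_loader:
        preds = combined_model(inputs)
        for loc_id, pred in zip(location_ids, preds):
            # One row per location: its id plus the six predicted feature values
            rows.append({'cluster_id': loc_id.item(), **dict(zip(column_names, pred.tolist()))})

# Predicted indicator values for the unsurveyed locations
predicted_labels = pd.DataFrame(rows)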
On paper, the model I trained on both street-level and satellite images could perform well at the first two tasks, as it predicts the features quite accurately from visual inputs alone. However, I highlighted some problems with this approach in the previous section.
The third aspect seems ethically much less questionable. It does not replace human competence, but rather acts as a tool to help people understand complex phenomena relating to progress towards the SDGs. A CNN, such as the ResNet34 I used for training, learns different features in the form of channels. Observing the learned features can tell us which aspects of the images the network uses to make its predictions. If a model is good at making predictions, such as the combined model, then understanding how it makes those predictions could provide valuable insights into the correlation between visual features and SDG indicators. Unfortunately, my combined model is not suited for this task, as it does not learn visually interpretable features. As previously discussed, this may be because it receives input images with 11 channels, which cannot be displayed meaningfully in 2D, and thus its learned features are not interpretable as images.
The model trained on the street-level images, on the other hand, receives three input channels, which represent the RGB channels of an image on a screen. Thus, its learned features are visually interpretable as different shapes of a road, yet they do not provide any specific information on how the predictions are made. This may be because the model does not manage to fit the data sufficiently well, as observed in the earlier evaluation steps.
Novelty of Project¶
Projects similar to this one have been attempted in the past, as mentioned in the first section. This project extends the current research in three ways:
- It extends SustainBench by proposing a model which outperforms the existing models on the given benchmark tasks.
- It combines street and satellite images to predict socio-economic factors, while previous attempts focus solely on either street or satellite images.
- It provides a visual analysis of the learned features, unfortunately concluding that, for this model, the features are not visually interpretable.
For future work, it would be interesting to use non-visual approaches to understanding how the model makes its predictions. In addition, incorporating Bayesian uncertainty quantification would allow the confidence of predictions of critical data to be assessed. Finally, future work could seek further ways of mitigating the biases mentioned in the previous section. I conclude with the assessment that machine learning can be a very powerful tool for inferring SDG indicators, with the danger of potentially opaque, biased and superficial predictions. Machine learning can thus not replace human competence, but potentially expand it.