Prepare data and 1D analysis

This notebook contains the creation of the feature space and the 1d histogram analysis (t-test, median values and 1dOT).
Furthermore, the Gaussian approximations and the linear embedding are calculated in preparation for the multidimensional analysis with OT (see nb2b_OT_analysis.ipynb).

In [1]:
import numpy as np
from numpy import mean, std, median, amax, pi, exp, append, argmax, save, savetxt, loadtxt
from numpy import array, column_stack, zeros, sin, cos, sqrt, load, ones, concatenate
import ot
import matplotlib.pyplot as plt
from numpy.random import multivariate_normal
from scipy.spatial import cKDTree
from time import time
import os
In [2]:
## settings for figures

import matplotlib.font_manager as fm
import matplotlib as mpl

# Edit font size, and axes width
plt.rcParams['font.size'] = 18
plt.rcParams['axes.linewidth'] = 2
In [3]:
# Description which coloum contains which feature.
# 0,1,2: X, Center of Geometry (µm)	Y, Center of Geometry (µm)	Z, Center of Geometry (µm)
# 3: Mean, Intensities
# 4: Standard Dev., Intensities
# 5: Volume, Volume (µm³)
# 6: Sphericity
# 7: Number of neighbors
# 8: Distance to nearest neighbor
In [4]:
## define datapaths
txt_data = 'Data/txt-data/'
fileroot = 'Data/Data-for-TransportAnalysis/'
resultpath = 'Results/'

if not os.path.isdir(resultpath):
    os.makedirs(resultpath)
In [5]:
# load in txt tables. The columns contain the features, rows correspond to single cells
# every file is one patient

#Group1 - CTRL
data1 = loadtxt(txt_data + '6716.txt', skiprows = 1)
data2 = loadtxt(txt_data + '6916.txt', skiprows = 1)
data3 = loadtxt(txt_data + '2616.txt', skiprows = 1)
data4 = loadtxt(txt_data + '5416.txt', skiprows = 1)
data5 = loadtxt(txt_data + '12816.txt', skiprows = 1)
data6 = loadtxt(txt_data + '6515.txt', skiprows = 1)

#Group2 - MS
data7 = loadtxt(txt_data + '12006.txt', skiprows = 1)
data8 = loadtxt(txt_data + 'T800.txt', skiprows = 1)
data9 = loadtxt(txt_data + '11313.txt', skiprows = 1)
data10 = loadtxt(txt_data + '13609.txt', skiprows = 1)
data11 = loadtxt(txt_data + '22091.txt', skiprows = 1)
data12 = loadtxt(txt_data + '3614.txt', skiprows = 1)

data = [data1, data2, data3, data4, data5, data6, data7, data8, data9, data10, data11, data12]
In [6]:
# define name of the features , tags and corresponding classes.
labels = ['X-Position', 'Y-Position', 'Z-Position', 'Volume', 'Electron density', 'Heterogeneity', 'Sphericity', 'Num-Neighbors', 'NN-Distance']
sampleTags = ['6716', '6916', '2616', '5416', '12816', '6515', '12006', 'T800', '11313', '13609', '22091', '3614']
classes = [0,0,0,0,0,0,1,1,1,1,1,1]
In [7]:
# list of indices of feature columns used in  analysis
list_f = [3,4,5,6,7,8]
In [8]:
## determine two features out of the nuclei positions:
# number of nearest neighbors within a radius (7.45µm) and distances to nearest neighbor


posdata = []
for i in range(len(data)):
    posdata.append(data[i][:,0:3])

# done with cKDtree
def nearest_neighbors_stats(x, r):
    # creates the tree
    tree = cKDTree(x)

    # number of neighbors in shell
    # iterates over all lists of pairs and returns the length of this list
    distances = tree.query_ball_point(x, r)
    d_counts = array([len(d) - 1 for d in distances])

    # calculates distance to nearest neighbor
    nn, ii = tree.query(x, k=[2])

    return d_counts, nn

# calculate number of neighbors in shell NN
for d, i in zip(posdata, range(len(posdata))):
    d_counts, nn = nearest_neighbors_stats(d, 7.45)    # value of 7.45 microns determined by first local minimum of radial distributions function

    # appending both features to data
    data[i] = column_stack((data[i], d_counts))
    data[i] = column_stack((data[i], nn))
In [9]:
## perform t-test for all features, print p values and medians of unstandartisized features for both groups in console
from scipy.stats import ttest_ind


# Note: since the feature number of neighbors always takes on integer values,
# the mean of the population is more expressive than the median of it.

for s in list_f:

    medians = []
    means = []

    for d in data:
        medians.append(median(d[:, s]))
        means.append(mean(d[:, s]))

    # apply Welch's two-sided t-test
    if s == 7:
        res = ttest_ind(means[0:6], means[6:12], equal_var=False)
    else:
        res = ttest_ind(medians[0:6], medians[6:12], equal_var=False)

    # print p-values
    print('p-Value of ' + labels[s] + ':', res[1])

   # print population medians and standard-deviation
    if s == 7:
        print('--ALL--')
        print(mean(means[0:12]))
        print(std(means[0:12]))

        print('--CTRL--')
        print(mean(means[0:6]))
        print(std(means[0:6]))

        print('--MS--')
        print(mean(means[6:12]))
        print(std(means[6:12]))

        print('___________')

    else:
        print('--ALL--')
        print(mean(medians[0:12]))
        print(std(medians[0:12]))

        print('--CTRL--')
        print(mean(medians[0:6]))
        print(std(medians[0:6]))

        print('--MS--')
        print(mean(medians[6:12]))
        print(std(medians[6:12]))

        print('___________')
p-Value of Volume: 0.28321472462861425
--ALL--
26.83089484333333
4.410673808858973
--CTRL--
28.353984208333333
5.347978077080411
--MS--
25.307805478333332
2.3806753150785043
___________
p-Value of Electron density: 0.2255301193645871
--ALL--
31.228847882083333
5.383980997603107
--CTRL--
29.192974651666663
5.019227798764506
--MS--
33.264721112500006
4.948969129885326
___________
p-Value of Heterogeneity: 0.0010978853197003086
--ALL--
8.784344697083332
0.9422884562319336
--CTRL--
8.011252402083334
0.5610520934610334
--MS--
9.557436992083334
0.5154534166558398
___________
p-Value of Sphericity: 0.5692073017109736
--ALL--
0.624443603125
0.009861512399345731
--CTRL--
0.6262655676666666
0.006543750954637025
--MS--
0.6226216385833333
0.01204321665778728
___________
p-Value of Num-Neighbors: 0.2808374913049974
--ALL--
7.243187016200025
1.204002072805068
--CTRL--
6.828422827967451
0.7540577048155545
--MS--
7.657951204432599
1.40946099512035
___________
p-Value of NN-Distance: 0.39128960309118455
--ALL--
4.377236832167613
0.2180066307421607
--CTRL--
4.436696444036097
0.19859700253184773
--MS--
4.31777722029913
0.2203227673059717
___________
In [10]:
## Standartisation with z-score


from scipy.stats import zmap


# Index of features to standartisize:
for i in list_f:

    #creates the total population (all nuclei of all patients) for each feature i
    ges = []
    for d in data:
        ges = concatenate((ges,d[:,i]))

    #standartisize the patient-population with total population
    for d in data:
        d[:,i] = zmap(d[:,i], ges)
In [11]:
## plotting histograms of single features from one patient exemplarily


for s in list_f:
    plt.figure()
    plt.grid(zorder=0)
    plt.hist(data[0][:,s], bins = 70, density=True, zorder =3, ec = 'black', fc = 'blue')
    plt.xlabel(labels[s], fontsize = 20)
    plt.ylabel('Frequency',fontsize = 20 )
    plt.xticks(fontsize = 15)
    plt.yticks(fontsize = 15)
    plt.xlim(-2.7,5.2)

    ax = plt.gca()
    ax.xaxis.set_tick_params(which='major', size=6, width=2, direction='out')
    ax.yaxis.set_tick_params(which='major', size=6, width=2, direction='out')

    os.makedirs(resultpath + 'Histograms', exist_ok = True)
    plt.savefig(resultpath + 'Histograms/' + labels[s] + '.svg', bbox_inches='tight',pad_inches = 0.02, transparent = True)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [12]:
## draw point cloud and ellipses in 2d projection of feature space


fig = plt.figure(2)

# select features for 2d plane
i = 3
j = 6

for x in range(1):

    plt.title('Feature Space', fontsize=20)
    plt.xlabel(labels[i], fontsize=20)
    plt.ylabel(labels[j], fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)

    #for i=3, j=6
    plt.xlim(-1.8, 3.6)
    plt.ylim(-2.5, 2.1)

    #for i=4-, j=5
    #plt.xlim(-2.6, 1.4)
    #plt.ylim(-3.2, 3.3)
    #plt.yticks([-2,0,2])

    ax = plt.gca()
    ax.xaxis.set_tick_params(which='major', size=6, width=2, direction='in')
    ax.yaxis.set_tick_params(which='major', size=6, width=2, direction='in')
    colors = ['blue', 'cyan', 'dodgerblue', 'royalblue', 'slateblue', 'lightskyblue',
                        'red', 'tomato', 'crimson', 'darkorange', 'darkred', 'firebrick']

    # pick patient
    k = 0

    # draw ellipse by computing empirical covariance and solving eigen-problem
    cov = np.cov(data[k].T)
    # pick covariance entries of 2d Subspace
    cov2x2 = np.reshape([cov[i,i], cov[i,j], cov[j,i], cov[j,j]], (2,2))

    eigenvalues, eigenvectors = np.linalg.eig(cov2x2)

    theta = np.linspace(0, 2*pi, 1000)
    ellipsis = (np.sqrt(eigenvalues[None,:]) * eigenvectors) @ [np.sin(theta), np.cos(theta)]

    # draw point cloud
    plt.plot(data[k][:,i], data[k][:,j] , '.', ms=0.22,  c=colors[0])
    # draw ellipse
    plt.plot(ellipsis[0, :] + mean(data[k][:, i]), ellipsis[1, :] + mean(data[k][:, j]), label=k, c=colors[0], lw = 2.5)

    os.makedirs(resultpath + 'Feature Space', exist_ok=True)
    plt.savefig(resultpath + 'Feature Space/FeatureSpace-'+labels[i]+'-'+labels[j]+'.png', bbox_inches='tight', pad_inches=0.02, transparent=True, dpi=300)
    plt.show()
No description has been provided for this image
In [13]:
## draw ellipses - as representation of the gaussian approximation - of all patient in 2d subspace


plt.figure(figsize=(7,6))

# select features for 2d plane
i = 3
j = 5

for x in range(1):

    plt.title('Feature Space', fontsize=16)
    plt.xlabel(labels[i], fontsize=16)
    plt.ylabel(labels[j], fontsize=16)

    #for i=4, j=5
    #plt.xlim(-2.4, 2.8)
    #plt.ylim(-2.6, 2.6)

    #for i=3, j=5
    plt.xlim(-1.8, 3.5)
    plt.ylim(-2.6, 2.6)
    #plt.yticks([-2,0,2])

    ax = plt.gca()
    ax.xaxis.set_tick_params(which='major', size=6, width=2, direction='in')
    ax.yaxis.set_tick_params(which='major', size=6, width=2, direction='in')
    ax.grid(ls='--')
    colors = ['blue', 'cyan', 'dodgerblue', 'royalblue', 'slateblue', 'lightskyblue',
                        'red', 'tomato', 'crimson', 'darkorange', 'darkred', 'firebrick']

    # for all patient draw an ellipse
    for k in range(0,12):

        # compute empirical covariance matrix
        cov = np.cov(data[k].T)

        # pick covariance entries of 2d Subspace
        cov2x2 = np.reshape([cov[i,i], cov[i,j], cov[j,i], cov[j,j]], (2,2))

        eigenvalues, eigenvectors = np.linalg.eig(cov2x2)

        theta = np.linspace(0, 2*pi, 1000)
        ellipsis = (np.sqrt(eigenvalues[None,:]) * eigenvectors) @ [np.sin(theta), np.cos(theta)]

        plt.plot(ellipsis[0,:] + mean(data[k][:,i]), ellipsis[1,:] + mean(data[k][:,j]), label = k+1, c=colors[k], lw = 2.5 )


    plt.legend(bbox_to_anchor=(1,1), loc=1, frameon=False, fontsize=12, handletextpad = 0.4)

    os.makedirs(resultpath + 'Ellipses', exist_ok=True)
    plt.savefig(resultpath + 'Ellipses/Ellipses-'+labels[i]+'-'+labels[j]+'.svg', bbox_inches='tight', pad_inches=0.02, transparent=True, dpi=400)
    plt.show()
No description has been provided for this image
In [14]:
##  plot median values


fig = plt.figure(figsize = (12,7))

X1 = ones(6) * 0.5
X2 = ones(6) * 1

ax = fig.add_subplot(111)
ax.set_ylabel('Median over population ', fontsize=26)
# Turn off axis lines and ticks of the big subplot
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.spines['right'].set_color('none')
ax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)


# for each of the six feature
for k, s in enumerate(list_f):

    # plot median over population from each patients
    medians = []
    means = []              # only for features NN -> mean instead of median

    for d in data:
        medians.append(median(d[:,s]))
        means.append(mean(d[:,s]))

    fig.add_subplot(2, 3, k + 1)
    if k == 4:
        plt.plot(X1, means[0:6], '_', ms=30, lw=100, c='blue', label='CTRL')
        plt.plot(X2, means[6:12], '_', ms=30, c='red', label='MS')
        plt.plot(X1[0], mean(means[0:6]), '_', ms=50, c='black')
        plt.plot(X2[0], mean(means[6:12]), '_', ms=50, c='black')
    else:
        plt.plot(X1, medians[0:6], '_', ms=30, lw= 100, c = 'blue', label='CTRL')
        plt.plot(X2, medians[6:12], '_', ms=30, c='red', label='MS')
        plt.plot(X1[0], mean(medians[0:6]), '_', ms=50, c = 'black' )
        plt.plot(X2[0], mean(medians[6:12]), '_', ms=50, c = 'black' )
    plt.axvline(x=0.5, ls='--', c='grey')
    plt.axvline(x=1, ls='--', c='grey')
    ax = plt.gca()
    ax.xaxis.set_tick_params(which='major', size=6, width=2, direction='in')
    ax.yaxis.set_tick_params(which='major', size=6, width=2, direction='in')
    plt.title(labels[s])
    plt.xlim(0,1.5)
    plt.xticks(ticks = [0.5,1], labels = ['CTRL', 'MS'])

fig.tight_layout()

os.makedirs(resultpath + 'Median-Values', exist_ok=True)
plt.savefig(resultpath + 'Median-Values/Median-Values.svg',  bbox_inches='tight', pad_inches=0.02, transparent = True)
plt.show()
No description has been provided for this image
In [15]:
## create Violinplots


# for each feature:
for j,s in enumerate(list_f):
    v = []
    meds = []
    cb = []
    ct = []
    q1 = []
    q3 = []

    # since data are very scattered, an IQR filter is applied which cuts off the outermost 5% of the edges
    # from filterd data first and third quantile is calculated, shown as black bars in the plot
    for d in data:
        cut_bottom, med, cut_top = np.percentile(d[:, s], [5, 50, 95])

        # filter data
        filter = d[:,s].copy()
        filter = filter[filter > cut_bottom]
        filter = filter[filter < cut_top]

        #filtered data and mix/max of it
        v.append(filter)
        cb.append(cut_bottom)
        ct.append(cut_top)
        #save medians
        meds.append(med)
        # from filtered data compute first and third quantile
        q1.append(np.percentile(filter, [25]))
        q3.append(np.percentile(filter, [75]))


    plt.figure(figsize=(12, 5))

    ax = plt.gca()
    ax.set_axisbelow(True)
    ax.grid(ls='--')
    vplots = plt.violinplot(v, showmedians=False, showextrema=False, points = 200)

    inds = np.arange(1, len(meds) + 1)
    # plot medians as white dots
    plt.scatter(inds, meds, marker='o', color='white', s=30, zorder=3)
    plt.vlines(inds, array(cb), array(ct), color='Black', linestyle='-', lw=1)
    #plot quantiles
    plt.vlines(inds, array(q1), array(q3), color='Black', linestyle='-', lw=5)

    plt.xticks(ticks = inds, labels = inds, fontsize = 15 )
    plt.yticks(fontsize=15)
    plt.xlabel('Subject Number', fontsize = 26)
    plt.ylabel(labels[s], fontsize = 26)

    # set colors
    for pc,i in zip(vplots['bodies'],range(12)):
        if i < 6:
            pc.set_facecolor('blue')
            pc.set_edgecolor('Darkblue')
            pc.set_alpha(0.75)
        else:
            pc.set_facecolor('red')
            pc.set_edgecolor('Darkred')
            pc.set_alpha(0.75)

    # customize legend
    import matplotlib.patches as mpatches
    red_patch = mpatches.Patch(color='red')
    blue_patch = mpatches.Patch(color='blue')
    l = ['Control', 'MS']
    legend_pos = [2,2,2,3,2,1]
    plt.legend(loc=legend_pos[j], handles=[blue_patch,red_patch], labels=l, fontsize=13, handletextpad=0.35, edgecolor='black', frameon=True, fancybox=False)

    os.makedirs(resultpath + 'Violin-Plots', exist_ok=True)
    plt.savefig(resultpath + 'Violin-Plots/' + labels[s] + '-Violinplot.svg', bbox_inches='tight', pad_inches=0.02, transparent = True)
    plt.show()
No description has been provided for this image