Prepare data and 1D analysis¶
This notebook contains the creation of the feature space and the 1d histogram analysis (t-test, median values and 1dOT).
Furthermore, the Gaussian approximations and the linear embedding are calculated in preparation for the multidimensional analysis with OT (see nb2b_OT_analysis.ipynb
).
In [1]:
import numpy as np
from numpy import mean, std, median, amax, pi, exp, append, argmax, save, savetxt, loadtxt
from numpy import array, column_stack, zeros, sin, cos, sqrt, load, ones, concatenate
import ot
import matplotlib.pyplot as plt
from numpy.random import multivariate_normal
from scipy.spatial import cKDTree
from time import time
import os
In [2]:
## settings for figures
import matplotlib.font_manager as fm
import matplotlib as mpl
# Edit font size, and axes width
plt.rcParams['font.size'] = 18
plt.rcParams['axes.linewidth'] = 2
In [3]:
# Description which coloum contains which feature.
# 0,1,2: X, Center of Geometry (µm) Y, Center of Geometry (µm) Z, Center of Geometry (µm)
# 3: Mean, Intensities
# 4: Standard Dev., Intensities
# 5: Volume, Volume (µm³)
# 6: Sphericity
# 7: Number of neighbors
# 8: Distance to nearest neighbor
In [4]:
## define datapaths
txt_data = 'Data/txt-data/'
fileroot = 'Data/Data-for-TransportAnalysis/'
resultpath = 'Results/'
if not os.path.isdir(resultpath):
os.makedirs(resultpath)
In [5]:
# load in txt tables. The columns contain the features, rows correspond to single cells
# every file is one patient
#Group1 - CTRL
data1 = loadtxt(txt_data + '6716.txt', skiprows = 1)
data2 = loadtxt(txt_data + '6916.txt', skiprows = 1)
data3 = loadtxt(txt_data + '2616.txt', skiprows = 1)
data4 = loadtxt(txt_data + '5416.txt', skiprows = 1)
data5 = loadtxt(txt_data + '12816.txt', skiprows = 1)
data6 = loadtxt(txt_data + '6515.txt', skiprows = 1)
#Group2 - MS
data7 = loadtxt(txt_data + '12006.txt', skiprows = 1)
data8 = loadtxt(txt_data + 'T800.txt', skiprows = 1)
data9 = loadtxt(txt_data + '11313.txt', skiprows = 1)
data10 = loadtxt(txt_data + '13609.txt', skiprows = 1)
data11 = loadtxt(txt_data + '22091.txt', skiprows = 1)
data12 = loadtxt(txt_data + '3614.txt', skiprows = 1)
data = [data1, data2, data3, data4, data5, data6, data7, data8, data9, data10, data11, data12]
In [6]:
# define name of the features , tags and corresponding classes.
labels = ['X-Position', 'Y-Position', 'Z-Position', 'Volume', 'Electron density', 'Heterogeneity', 'Sphericity', 'Num-Neighbors', 'NN-Distance']
sampleTags = ['6716', '6916', '2616', '5416', '12816', '6515', '12006', 'T800', '11313', '13609', '22091', '3614']
classes = [0,0,0,0,0,0,1,1,1,1,1,1]
In [7]:
# list of indices of feature columns used in analysis
list_f = [3,4,5,6,7,8]
In [8]:
## determine two features out of the nuclei positions:
# number of nearest neighbors within a radius (7.45µm) and distances to nearest neighbor
posdata = []
for i in range(len(data)):
posdata.append(data[i][:,0:3])
# done with cKDtree
def nearest_neighbors_stats(x, r):
# creates the tree
tree = cKDTree(x)
# number of neighbors in shell
# iterates over all lists of pairs and returns the length of this list
distances = tree.query_ball_point(x, r)
d_counts = array([len(d) - 1 for d in distances])
# calculates distance to nearest neighbor
nn, ii = tree.query(x, k=[2])
return d_counts, nn
# calculate number of neighbors in shell NN
for d, i in zip(posdata, range(len(posdata))):
d_counts, nn = nearest_neighbors_stats(d, 7.45) # value of 7.45 microns determined by first local minimum of radial distributions function
# appending both features to data
data[i] = column_stack((data[i], d_counts))
data[i] = column_stack((data[i], nn))
In [9]:
## perform t-test for all features, print p values and medians of unstandartisized features for both groups in console
from scipy.stats import ttest_ind
# Note: since the feature number of neighbors always takes on integer values,
# the mean of the population is more expressive than the median of it.
for s in list_f:
medians = []
means = []
for d in data:
medians.append(median(d[:, s]))
means.append(mean(d[:, s]))
# apply Welch's two-sided t-test
if s == 7:
res = ttest_ind(means[0:6], means[6:12], equal_var=False)
else:
res = ttest_ind(medians[0:6], medians[6:12], equal_var=False)
# print p-values
print('p-Value of ' + labels[s] + ':', res[1])
# print population medians and standard-deviation
if s == 7:
print('--ALL--')
print(mean(means[0:12]))
print(std(means[0:12]))
print('--CTRL--')
print(mean(means[0:6]))
print(std(means[0:6]))
print('--MS--')
print(mean(means[6:12]))
print(std(means[6:12]))
print('___________')
else:
print('--ALL--')
print(mean(medians[0:12]))
print(std(medians[0:12]))
print('--CTRL--')
print(mean(medians[0:6]))
print(std(medians[0:6]))
print('--MS--')
print(mean(medians[6:12]))
print(std(medians[6:12]))
print('___________')
In [10]:
## Standartisation with z-score
from scipy.stats import zmap
# Index of features to standartisize:
for i in list_f:
#creates the total population (all nuclei of all patients) for each feature i
ges = []
for d in data:
ges = concatenate((ges,d[:,i]))
#standartisize the patient-population with total population
for d in data:
d[:,i] = zmap(d[:,i], ges)
In [11]:
## plotting histograms of single features from one patient exemplarily
for s in list_f:
plt.figure()
plt.grid(zorder=0)
plt.hist(data[0][:,s], bins = 70, density=True, zorder =3, ec = 'black', fc = 'blue')
plt.xlabel(labels[s], fontsize = 20)
plt.ylabel('Frequency',fontsize = 20 )
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.xlim(-2.7,5.2)
ax = plt.gca()
ax.xaxis.set_tick_params(which='major', size=6, width=2, direction='out')
ax.yaxis.set_tick_params(which='major', size=6, width=2, direction='out')
os.makedirs(resultpath + 'Histograms', exist_ok = True)
plt.savefig(resultpath + 'Histograms/' + labels[s] + '.svg', bbox_inches='tight',pad_inches = 0.02, transparent = True)
plt.show()
In [12]:
## draw point cloud and ellipses in 2d projection of feature space
fig = plt.figure(2)
# select features for 2d plane
i = 3
j = 6
for x in range(1):
plt.title('Feature Space', fontsize=20)
plt.xlabel(labels[i], fontsize=20)
plt.ylabel(labels[j], fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
#for i=3, j=6
plt.xlim(-1.8, 3.6)
plt.ylim(-2.5, 2.1)
#for i=4-, j=5
#plt.xlim(-2.6, 1.4)
#plt.ylim(-3.2, 3.3)
#plt.yticks([-2,0,2])
ax = plt.gca()
ax.xaxis.set_tick_params(which='major', size=6, width=2, direction='in')
ax.yaxis.set_tick_params(which='major', size=6, width=2, direction='in')
colors = ['blue', 'cyan', 'dodgerblue', 'royalblue', 'slateblue', 'lightskyblue',
'red', 'tomato', 'crimson', 'darkorange', 'darkred', 'firebrick']
# pick patient
k = 0
# draw ellipse by computing empirical covariance and solving eigen-problem
cov = np.cov(data[k].T)
# pick covariance entries of 2d Subspace
cov2x2 = np.reshape([cov[i,i], cov[i,j], cov[j,i], cov[j,j]], (2,2))
eigenvalues, eigenvectors = np.linalg.eig(cov2x2)
theta = np.linspace(0, 2*pi, 1000)
ellipsis = (np.sqrt(eigenvalues[None,:]) * eigenvectors) @ [np.sin(theta), np.cos(theta)]
# draw point cloud
plt.plot(data[k][:,i], data[k][:,j] , '.', ms=0.22, c=colors[0])
# draw ellipse
plt.plot(ellipsis[0, :] + mean(data[k][:, i]), ellipsis[1, :] + mean(data[k][:, j]), label=k, c=colors[0], lw = 2.5)
os.makedirs(resultpath + 'Feature Space', exist_ok=True)
plt.savefig(resultpath + 'Feature Space/FeatureSpace-'+labels[i]+'-'+labels[j]+'.png', bbox_inches='tight', pad_inches=0.02, transparent=True, dpi=300)
plt.show()
In [13]:
## draw ellipses - as representation of the gaussian approximation - of all patient in 2d subspace
plt.figure(figsize=(7,6))
# select features for 2d plane
i = 3
j = 5
for x in range(1):
plt.title('Feature Space', fontsize=16)
plt.xlabel(labels[i], fontsize=16)
plt.ylabel(labels[j], fontsize=16)
#for i=4, j=5
#plt.xlim(-2.4, 2.8)
#plt.ylim(-2.6, 2.6)
#for i=3, j=5
plt.xlim(-1.8, 3.5)
plt.ylim(-2.6, 2.6)
#plt.yticks([-2,0,2])
ax = plt.gca()
ax.xaxis.set_tick_params(which='major', size=6, width=2, direction='in')
ax.yaxis.set_tick_params(which='major', size=6, width=2, direction='in')
ax.grid(ls='--')
colors = ['blue', 'cyan', 'dodgerblue', 'royalblue', 'slateblue', 'lightskyblue',
'red', 'tomato', 'crimson', 'darkorange', 'darkred', 'firebrick']
# for all patient draw an ellipse
for k in range(0,12):
# compute empirical covariance matrix
cov = np.cov(data[k].T)
# pick covariance entries of 2d Subspace
cov2x2 = np.reshape([cov[i,i], cov[i,j], cov[j,i], cov[j,j]], (2,2))
eigenvalues, eigenvectors = np.linalg.eig(cov2x2)
theta = np.linspace(0, 2*pi, 1000)
ellipsis = (np.sqrt(eigenvalues[None,:]) * eigenvectors) @ [np.sin(theta), np.cos(theta)]
plt.plot(ellipsis[0,:] + mean(data[k][:,i]), ellipsis[1,:] + mean(data[k][:,j]), label = k+1, c=colors[k], lw = 2.5 )
plt.legend(bbox_to_anchor=(1,1), loc=1, frameon=False, fontsize=12, handletextpad = 0.4)
os.makedirs(resultpath + 'Ellipses', exist_ok=True)
plt.savefig(resultpath + 'Ellipses/Ellipses-'+labels[i]+'-'+labels[j]+'.svg', bbox_inches='tight', pad_inches=0.02, transparent=True, dpi=400)
plt.show()
In [14]:
## plot median values
fig = plt.figure(figsize = (12,7))
X1 = ones(6) * 0.5
X2 = ones(6) * 1
ax = fig.add_subplot(111)
ax.set_ylabel('Median over population ', fontsize=26)
# Turn off axis lines and ticks of the big subplot
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.spines['right'].set_color('none')
ax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
# for each of the six feature
for k, s in enumerate(list_f):
# plot median over population from each patients
medians = []
means = [] # only for features NN -> mean instead of median
for d in data:
medians.append(median(d[:,s]))
means.append(mean(d[:,s]))
fig.add_subplot(2, 3, k + 1)
if k == 4:
plt.plot(X1, means[0:6], '_', ms=30, lw=100, c='blue', label='CTRL')
plt.plot(X2, means[6:12], '_', ms=30, c='red', label='MS')
plt.plot(X1[0], mean(means[0:6]), '_', ms=50, c='black')
plt.plot(X2[0], mean(means[6:12]), '_', ms=50, c='black')
else:
plt.plot(X1, medians[0:6], '_', ms=30, lw= 100, c = 'blue', label='CTRL')
plt.plot(X2, medians[6:12], '_', ms=30, c='red', label='MS')
plt.plot(X1[0], mean(medians[0:6]), '_', ms=50, c = 'black' )
plt.plot(X2[0], mean(medians[6:12]), '_', ms=50, c = 'black' )
plt.axvline(x=0.5, ls='--', c='grey')
plt.axvline(x=1, ls='--', c='grey')
ax = plt.gca()
ax.xaxis.set_tick_params(which='major', size=6, width=2, direction='in')
ax.yaxis.set_tick_params(which='major', size=6, width=2, direction='in')
plt.title(labels[s])
plt.xlim(0,1.5)
plt.xticks(ticks = [0.5,1], labels = ['CTRL', 'MS'])
fig.tight_layout()
os.makedirs(resultpath + 'Median-Values', exist_ok=True)
plt.savefig(resultpath + 'Median-Values/Median-Values.svg', bbox_inches='tight', pad_inches=0.02, transparent = True)
plt.show()
In [15]:
## create Violinplots
# for each feature:
for j,s in enumerate(list_f):
v = []
meds = []
cb = []
ct = []
q1 = []
q3 = []
# since data are very scattered, an IQR filter is applied which cuts off the outermost 5% of the edges
# from filterd data first and third quantile is calculated, shown as black bars in the plot
for d in data:
cut_bottom, med, cut_top = np.percentile(d[:, s], [5, 50, 95])
# filter data
filter = d[:,s].copy()
filter = filter[filter > cut_bottom]
filter = filter[filter < cut_top]
#filtered data and mix/max of it
v.append(filter)
cb.append(cut_bottom)
ct.append(cut_top)
#save medians
meds.append(med)
# from filtered data compute first and third quantile
q1.append(np.percentile(filter, [25]))
q3.append(np.percentile(filter, [75]))
plt.figure(figsize=(12, 5))
ax = plt.gca()
ax.set_axisbelow(True)
ax.grid(ls='--')
vplots = plt.violinplot(v, showmedians=False, showextrema=False, points = 200)
inds = np.arange(1, len(meds) + 1)
# plot medians as white dots
plt.scatter(inds, meds, marker='o', color='white', s=30, zorder=3)
plt.vlines(inds, array(cb), array(ct), color='Black', linestyle='-', lw=1)
#plot quantiles
plt.vlines(inds, array(q1), array(q3), color='Black', linestyle='-', lw=5)
plt.xticks(ticks = inds, labels = inds, fontsize = 15 )
plt.yticks(fontsize=15)
plt.xlabel('Subject Number', fontsize = 26)
plt.ylabel(labels[s], fontsize = 26)
# set colors
for pc,i in zip(vplots['bodies'],range(12)):
if i < 6:
pc.set_facecolor('blue')
pc.set_edgecolor('Darkblue')
pc.set_alpha(0.75)
else:
pc.set_facecolor('red')
pc.set_edgecolor('Darkred')
pc.set_alpha(0.75)
# customize legend
import matplotlib.patches as mpatches
red_patch = mpatches.Patch(color='red')
blue_patch = mpatches.Patch(color='blue')
l = ['Control', 'MS']
legend_pos = [2,2,2,3,2,1]
plt.legend(loc=legend_pos[j], handles=[blue_patch,red_patch], labels=l, fontsize=13, handletextpad=0.35, edgecolor='black', frameon=True, fancybox=False)
os.makedirs(resultpath + 'Violin-Plots', exist_ok=True)
plt.savefig(resultpath + 'Violin-Plots/' + labels[s] + '-Violinplot.svg', bbox_inches='tight', pad_inches=0.02, transparent = True)
plt.show()