www.sciencesignaling.org/cgi/content/full/9/432/re6/DC1

Supplementary Materials for
Avoiding common pitfalls when clustering biological data

Tom Ronan, Zhijie Qi, Kristen M. Naegle*

*Corresponding author. Email: [email protected]

Published 14 June 2016, Sci. Signal. 9, re6 (2016)
DOI: 10.1126/scisignal.aad1932

The PDF file includes:
File S1. Output of the iPython Notebook that generates all examples in this review.

File S1. Output of the iPython Notebook that generates all examples in this review.
Executable code and file dependencies are available for checkout from the GitHub repository: https://github.com/knaegle/clusteringReview

Clustering Review
March 10, 2016

In [1]:
## Avoiding Common Pitfalls When Clustering Biological Data
##
## A guide to avoiding common pitfalls when clustering high-throughput biological data.
## Authors: Tom Ronan, Zhijie Qi, Kristen M. Naegle
## Supplemental Materials: iPython Notebook with data analysis and code to generate
## toy data sets, and all toy and real data analysis.
## requires 'Common_Affy.txt' and 'Common_miRNA.txt' from Lu, et al. (2005), and
## 'MRM_export_exp30_10_11_11_noStddev.txt' from Naegle, et al.
(2009) ## Dependencies %matplotlib inline import matplotlib.pyplot as plt import pandas as pd import numpy as np import pylab from matplotlib import colors import matplotlib.patches as patches import scipy.cluster.hierarchy as sch import scipy.spatial.distance as ssd from sklearn.decomposition import PCA as sklearnPCA import sklearn.metrics.pairwise as pwdist from mpl_toolkits.mplot3d import Axes3D from from from from sklearn import cluster, datasets sklearn.neighbors import kneighbors_graph sklearn.preprocessing import StandardScaler sklearn import mixture from scipy.cluster.hierarchy import fcluster import itertools as it # change to reflect input file location inputdir = ’./’ ### Data processing for Lu (2005) ## load mRNA 1 fileName = ’Common_Affy.txt’ raw_mRNA89 = pd.read_csv(fileName, sep=’\t’,skiprows=2) raw_mRNA89.set_index(’Name’, inplace=True) raw_mRNA89.drop(’Description’, axis=1, inplace=True) raw_mRNA89_filtered=raw_mRNA89[~(raw_mRNA89<7.25).all(axis=1)] ## load miRNA fileName = ’Common_miRNA.txt’ raw_miRNA89 = pd.read_csv(fileName, sep=’\t’,skiprows=2) raw_miRNA89.set_index(’Name’, inplace=True) raw_miRNA89.drop(’Description’, axis=1, inplace=True) raw_miRNA89_filtered=raw_miRNA89[~(raw_miRNA89<7.25).all(axis=1)] ### Data Processing for Naegle (2009) ## load mRNA fileName = ’MRM_export_exp30_10_11_11_noStddev.txt’ raw_phosprot = pd.read_csv(fileName, sep=’\t’,skiprows=0) raw_phosprot.set_index([’gene_site’,’MS_id’,’pep’], inplace=True) raw_phosprot.drop(’run’, axis=1, inplace=True) # function necessary to plot subnested axes in matplotlib def add_subplot_axes(ax,rect,axisbg=’w’): fig = plt.gcf() box = ax.get_position() width = box.width height = box.height inax_position = ax.transAxes.transform(rect[0:2]) transFigure = fig.transFigure.inverted() infig_position = transFigure.transform(inax_position) x = infig_position[0] y = infig_position[1] width *= rect[2] height *= rect[3] subax = fig.add_axes([x,y,width,height],axisbg=axisbg) return subax In [2]: 
## Cluster Review Figure 1, Panels A and B ## Dimensionality np.random.seed(7) def randrange(n, vmin, vmax): return (vmax-vmin)*np.random.rand(n) + vmin fig=plt.figure(figsize=(14,8)) ## Data Schema D2 = raw_mRNA89_filtered D2_mc = D2.sub(D2.mean(axis=1),axis=0) D2_norm = D2_mc.div(D2.std(axis=1),axis=0) 2 D2_norm.dropna(thresh=2, inplace=True) D2_column_labels = D2.columns.tolist() # Lu row diagram axmatrix = fig.add_axes([0,0.5,.08,.4]) hm = D2_norm im = axmatrix.matshow(hm, aspect=’auto’, origin=’upper’, cmap=’bwr’, vmin=-3, vmax=3) axmatrix.set_xticks([]) axmatrix.set_yticks([]) axmatrix.set_title(’\’Gene\’\nClustering’, y=1.15,size=10) axmatrix.set_ylabel(str(raw_mRNA89_filtered.shape[0])+’ genes’) axmatrix.set_xlabel(str(raw_mRNA89_filtered.shape[1])+’ cell lines’) axmatrix.xaxis.set_label_position(’top’) # Lu column diagram axmatrix2 = fig.add_axes([.16,.73,.26,.16]) hm = D2_norm.transpose() im = axmatrix2.matshow(hm, aspect=’auto’, origin=’upper’, cmap=’bwr’, vmin=-3, vmax=3) axmatrix2.set_xticks([]) axmatrix2.set_yticks([]) axmatrix2.set_title(’\’Cell Line\’\nClustering’, y=1.4,size=10) axmatrix2.set_ylabel(str(raw_mRNA89_filtered.shape[1])+’ cell lines’) axmatrix2.set_xlabel(str(raw_mRNA89_filtered.shape[0])+’ genes’) axmatrix2.xaxis.set_label_position(’top’) # Lu transpose arrow ax_arrow = fig.add_axes([.08,0.5,.4,.22]) ax_arrow.axis(’off’) p = patches.FancyArrowPatch( (0.10, 0.6), (0.50, 0.9), connectionstyle=’arc3,rad=0.1’, # Default mutation_scale=20 ) ax_arrow.add_patch(p) ax_arrow.text(0.3,0.30,’Transpose\nof Data Matrix’,fontsize=8) plt.show() 3 In [3]: ## Cluster Review Figure 1, Panel C ## Dimensionality and High-Dimensionality fig=plt.figure(figsize=(14,8)) ## Add Sparsity Axes ax1 = fig.add_axes([0 ,0,0.3,0.45], projection=’3d’) ax2 = fig.add_axes([0.33,0,0.3,0.45], projection=’3d’) ax3 = fig.add_axes([0.66,0,0.3,0.45], projection=’3d’) ## Sparsity Plots n = 5 size = 10 characteristics_array = [(’r’, ’o’, size, 0.1, 0.3, 0.1, 0.5, 0.4, 0.7), 
(’r’, ’o’, size, 0.1, 0 for c, m, s, xl, xh, yl, yh, zl, zh in characteristics_array: xs = randrange(n, xl, xh) ys = randrange(n, yl, yh) zs = randrange(n, zl, zh) y0s = randrange(n, 0, 0) z0s = randrange(n, 0.1, 0.1) ax1.scatter(xs, y0s, z0s, s=s, c=c, marker=m) ax2.scatter(xs, y0s, zs, s=s, c=c, marker=m) ax3.scatter(xs, ys, zs, s=s, c=c, marker=m) # make ticklabels and ticklines invisible 4 for axn in [ax1,ax2,ax3]: for a in axn.w_xaxis.get_ticklines()+axn.w_xaxis.get_ticklabels(): a.set_visible(False) for a in axn.w_yaxis.get_ticklines()+axn.w_yaxis.get_ticklabels(): a.set_visible(False) for a in axn.w_zaxis.get_ticklines()+axn.w_zaxis.get_ticklabels(): a.set_visible(False) if axn == ax1 or axn == ax2: axn.dist+=-3 axn.set_xlim(0,0.7) axn.set_ylim(0.3,0.8) axn.set_zlim(0,0.8) axn.elev=0 axn.azim=270 if axn == ax3: axn.dist+=-1 axn.set_xlim(0,0.5) axn.set_ylim(0.3,0.8) axn.set_zlim(0,0.8) plt.show() /Users/knaegle/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elemen if self. 
edgecolors == str(’face’): In [4]: ## Cluster Review Figure 1, Panel D ## Dimensionality fig=plt.figure(figsize=(7,4)) ## Add 3 sigma plot axis ax4 = fig.add_subplot(111) ## 3 Sigma Coverage Plot coverage = np.ones((2000,), dtype=np.int) * 0.997 plot_power = np.arange(1,2001) plot_data = np.power(coverage,plot_power) ax4.plot(plot_data) 5 ax4.set_xticklabels([0,500,1000,1500,2000],rotation=90,size=8) ax4.set_yticklabels([’0%’, ’20%’, ’40%’, ’60%’, ’80%’, ’100%’],size=8) ax4.set_xlabel(’Dimensionality’,size=10) ax4.set_ylabel(r’3$\sigma$ Coverage’,size=10) plt.show() In [5]: ##Cluster Review Figure 2, Panel A ## Dimensionality Reduction ## Toy Dimensionality Reduction np.random.seed(0) clust1_x = np.arange(-1,-0.1,0.05) clust1_y = np.zeros(len(clust1_x)) clust2_x = np.arange(0.1,1,0.05) clust2_y = np.arange(0.1,1,0.05) clust3_x = np.arange(0.1,0.7,0.03) clust3_y = -1 * np.arange(0.1,0.7,0.03) cluster1_data = np.asarray([clust1_x+0.05*np.random.randn(len(clust1_x)),0.05*np.random.randn(le cluster2_data = np.asarray([clust2_x+0.05*np.random.randn(len(clust2_x)),0.05*np.random.randn(le cluster3_data = np.asarray([clust3_x+0.05*np.random.randn(len(clust3_x)),0.05*np.random.randn(le 6 fig=plt.figure(figsize=(10,10)) # original data space ax = fig.add_subplot(111) ax.scatter(cluster1_data[0],cluster1_data[1],s=4,color=’darkseagreen’) ax.scatter(cluster2_data[0],cluster2_data[1],s=4,color=’maroon’) ax.scatter(cluster3_data[0],cluster3_data[1],s=4,color=’orange’) #ax.scatter(noisy_data[0],noisy_data[1],color=’k’) ax.set_xlim(-2,2) ax.set_ylim(-2,2) ax.set_xticklabels([]) ax.set_yticklabels([]) # PCA projection pca_data = np.hstack([cluster1_data,cluster2_data,cluster3_data]).T pca = sklearnPCA(n_components=1) pca_soln = pca.fit_transform(pca_data) # plot direction of highest variance ax.plot([0,pca.components_[0][0]*1.5],[0,pca.components_[0][1]*1.5],’--r’) ax.plot([0,-pca.components_[0][0]*1.5],[0,-pca.components_[0][1]*1.5],’--r’) ax.set_xlim(-2,2) ax.set_ylim(-2,2) 
plt.show()

# In [6]:
##Cluster Review Figure 2, Panel B
## Dimensionality Reduction
fig = plt.figure(figsize=(10, 10))

# PCA dimensionality reduction
ax = fig.add_subplot(111)
# pca_soln holds the 1-D projection of cluster 1, 2 and 3 stacked in that
# order. The slice bounds are derived from the actual cluster sizes: the
# original hard-coded 0:18 / 18:36 / 36:54, which only fits three clusters of
# exactly 18 points, while cluster 3 is generated from
# np.arange(0.1, 0.7, 0.03) and is larger, so its last points were never drawn.
# (assumes cluster*_data are (2, n) arrays, as their [0]/[1] indexing implies)
cluster_sizes = [cluster1_data.shape[1], cluster2_data.shape[1], cluster3_data.shape[1]]
bounds = np.cumsum([0] + cluster_sizes)
panel_styles = [('darkseagreen', 1, 'Cluster1'),
                ('maroon', 1, 'Cluster2'),
                ('orange', 0.3, 'Cluster3')]
for start, stop, (shade, opacity, tag) in zip(bounds[:-1], bounds[1:], panel_styles):
    # every projected point is drawn on the horizontal line y = 0
    ax.scatter(pca_soln[start:stop], np.zeros(stop - start), s=4,
               color=shade, alpha=opacity, label=tag)
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.plot([-1.5, 1.5], [0, 0], '--r')
plt.show()

# In [7]:
##Cluster Review Figure 2, Panel C
## Dimensionality Reduction
fig = plt.figure(figsize=(10, 10))

# original data space
# subspaces marked
ax = fig.add_subplot(111)
ax.scatter(cluster1_data[0], cluster1_data[1], s=4, color='darkseagreen')
ax.scatter(cluster3_data[0], cluster3_data[1], s=4, color='orange')
ax.scatter(cluster2_data[0], cluster2_data[1], s=4, color='maroon')
ax.set_xlim(-2, 2)
ax.set_ylim(-2, 2)
ax.set_xticklabels([])
ax.set_yticklabels([])
# three dashed red segments, one drawn next to each cluster
ax.plot([1.7, 1], [1, 1.7], '--r')
ax.plot([-1.4, -1.4], [-0.7, 0.7], '--r')
ax.plot([0.4, 1.6], [-1.6, -0.6], '--r')
plt.show()

# In [8]:
##Cluster Review Figure 2, Panel D
## Dimensionality Reduction
fig = plt.figure(figsize=(13, 12))

## Dimensionality Reduction applied to Lu (2005) mRNA GI clustering results

#mRNA from Lu (2005)
D = raw_mRNA89_filtered
# row-wise standardization: subtract each row's mean, divide by its std
D_mc = D.sub(D.mean(axis=1), axis=0)
D_norm = D_mc.div(D.std(axis=1), axis=0)
# keep only rows with at least two non-NaN values
D_norm.dropna(thresh=2, inplace=True)
D_mRNA_column_labels = D.columns.tolist()
# transpose so that the original columns become the rows that get clustered
D_mRNA = D_norm.transpose()

#mRNA after applying PCA
D = raw_mRNA89_filtered
D_mc = D.sub(D.mean(axis=1), axis=0)
D_norm = D_mc.div(D.std(axis=1), axis=0)
D_norm.dropna(thresh=2, inplace=True)
D_mRNApca_column_labels = D.columns.tolist()
pca_model = sklearnPCA(n_components=10)
pca_model.fit(D_norm.transpose())
D_mRNApca = pca_model.transform(D_norm.transpose())

#mRNA dim
reduction (feature selection) D = raw_mRNA89_filtered D_mc = D.sub(D.mean(axis=1),axis=0) D_norm = D_mc.div(D.std(axis=1),axis=0) D_norm.dropna(thresh=2, inplace=True) D_mRNAdimred_column_labels = D.columns.tolist() mRNA_gi_dimension_list = [4914,7677,5373,3533,3786,9222,268,39,12017,9130] D_mRNAdimred = D_norm.transpose().iloc[:,mRNA_gi_dimension_list] # these are lists for the EP and GI tracks GI_list = [’_LVR_’,’_COLON_’,’_STOM_’,’_PAN_’] ### ### mRNA, as in Lu (2005) ### # Compute and plot dendrogram, clustering cell lines for mRNA ax_mRNA1 = fig.add_axes([0,0.1,1,.30]) lnk2 = sch.linkage(D_mRNA, method=’average’,metric=’correlation’) Z_cl = sch.dendrogram(lnk2,color_threshold=0) idx_cl = Z_cl[’leaves’] ax_mRNA1.axis(’off’) ax_mRNA1.set_title(’mRNA\n’) # Add color strip to indicate GI status ax_mRNA2 = fig.add_axes([0,0,1,0.10]) list_vals = [0 if any(cl in val for cl in GI_list) else 1 for val in D_mRNA_column_labels] unsorted_list = np.array(list_vals) sorted_list= unsorted_list[np.array(idx_cl)] 11 ax_mRNA2.matshow([sorted_list], aspect=’auto’, origin=’lower’, cmap = colors.ListedColormap([’bl ax_mRNA2.axis(’off’) plt.show() In [9]: ##Cluster Review Figure 2, Panels D, E, and F ## Dimensionality Reduction fig=plt.figure(figsize=(13,12)) ## Dimensionality Reduction applied to Lu (2005) mRNA GI clustering results ### ### mRNA after PCA ### # Compute and plot dendrogram, clustering cell lines for mRNA ax_D_mRNApca1 = fig.add_axes([0,0.1,1,0.30]) lnk2 = sch.linkage(D_mRNApca, method=’average’,metric=’correlation’) Z_cl = sch.dendrogram(lnk2,color_threshold=0) idx_cl = Z_cl[’leaves’] ax_D_mRNApca1.axis(’off’) ax_D_mRNApca1.set_title(’mRNA\n(PCA, 10 components)’) # Add color strip to indicate GI status ax_D_mRNApca2 = fig.add_axes([0,0,1,0.1]) list_vals = [0 if any(cl in val for cl in GI_list) else 1 for val in D_mRNApca_column_labels] unsorted_list = np.array(list_vals) sorted_list= unsorted_list[np.array(idx_cl)] ax_D_mRNApca2.matshow([sorted_list], 
aspect=’auto’, origin=’lower’, cmap = colors.ListedColormap ax_D_mRNApca2.axis(’off’) plt.show() 12 In [10]: ##Cluster Review Figure 2, Panels D, E, and F ## Dimensionality Reduction fig=plt.figure(figsize=(13,12)) ## Dimensionality Reduction applied to Lu (2005) mRNA GI clustering results ### ### mRNA after dimensionality reduction (feature selection) ### # Compute and plot dendrogram, clustering cell lines for mRNA ax_D_mRNAdimred1 = fig.add_axes([0,0.1,1,0.30]) lnk2 = sch.linkage(D_mRNAdimred, method=’average’,metric=’correlation’) Z_cl = sch.dendrogram(lnk2,color_threshold=0) idx_cl = Z_cl[’leaves’] ax_D_mRNAdimred1.axis(’off’) ax_D_mRNAdimred1.set_title(’mRNA\n(10 selected features)’) # Add color strip to indicate GI status ax_D_mRNAdimred2 = fig.add_axes([0,0,1,0.1]) list_vals = [0 if any(cl in val for cl in GI_list) else 1 for val in D_mRNAdimred_column_labels unsorted_list = np.array(list_vals) sorted_list= unsorted_list[np.array(idx_cl)] ax_D_mRNAdimred2.matshow([sorted_list], aspect=’auto’, origin=’lower’, cmap = colors.ListedColo ax_D_mRNAdimred2.axis(’off’) plt.show() 13 In [11]: ##Cluster Review Figure 3, Panels A and B ## Transformations and Distance Metrics ## Toy Transformation and Distance Metric Panel np.random.seed(0) # Generate datasets. 
# We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
n_samples = 200

#create interesting data set
np.random.seed(1)
# Each of the four clusters receives n_samples // 2 points. Floor division
# keeps the sample count an integer on Python 3 as well; the original
# n_samples/2 is a float there.
points_per_cluster = n_samples // 2

mean1 = [0.05, 0.05]
cov1 = [[0.0001, 0], [0, 0.0001]]
x1, y1 = np.random.multivariate_normal(mean1, cov1, points_per_cluster).T

mean2 = [0.5, 0.03]
cov2 = [[0.0001, 0], [0, 0.0001]]
x2, y2 = np.random.multivariate_normal(mean2, cov2, points_per_cluster).T

mean3 = [0.4, 1]
cov3 = [[0.02, 0.015], [0.015, 0.02]]
x3, y3 = np.random.multivariate_normal(mean3, cov3, points_per_cluster).T

mean4 = [2, 2]
cov4 = [[0.005, 0.003], [0.003, 0.005]]
x4, y4 = np.random.multivariate_normal(mean4, cov4, points_per_cluster).T

# one row per point, columns are the two coordinates
coordinates = np.transpose(np.vstack((np.hstack((x1, x2, x3, x4)),
                                      np.hstack((y1, y2, y3, y4)))))
# reference labels 0, 1, 2, 3 for the four generated clusters
# (this statement was cut at the PDF margin; the closing brackets are restored)
categories = np.hstack((np.zeros(len(x1)), np.ones(len(x2)),
                        1 + np.ones(len(x3)), 2 + np.ones(len(x4))))
example_data = (coordinates, categories)

# dataset creation
X, y = example_data
X_raw = X
X_log2 = np.log2(X)
X_exp = np.exp(X)
# the three scalings below use statistics taken over the whole array
X_Zscore = np.divide(X - np.mean(X), np.std(X))
X_range = np.divide(X - np.mean(X), np.max(X) - np.min(X))
X_vast = np.divide(X - np.mean(X), np.std(X)) * np.divide(np.mean(X), np.std(X))

data_types = [X_raw, X_log2, X_exp]
y_names = ['No transformation', 'Log base 2', 'Exponential']

# create clustering estimators
agglom = cluster.AgglomerativeClustering(n_clusters=4, linkage='average')
# The affinity strings of the next two estimators were cut at the PDF margin
# ("ma" / "cosin"); they are restored to the metrics named in x_names below.
# NOTE(review): recent scikit-learn releases rename 'affinity' to 'metric' -
# confirm against the installed version.
agglom_manhattan = cluster.AgglomerativeClustering(n_clusters=4, linkage='average',
                                                   affinity="manhattan")
agglom_cosine = cluster.AgglomerativeClustering(n_clusters=4, linkage='average',
                                                affinity="cosine")
clustering_algorithms = ['gold', agglom, agglom_manhattan, agglom_cosine]
x_names = ['Actual Clusters', 'Euclidean', 'Manhattan', 'Cosine']

fig = plt.figure(figsize=(13, 13))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01)

# no transformation, no clustering
clustcolors = np.array(['darkseagreen', 'dodgerblue', 'orange', 'khaki'])
ax = plt.subplot2grid((10, 10), (0, 1), colspan=2, rowspan=2)
y_pred = np.asarray(y).astype(int)
def _cluster_labels(estimator, data):
    """Fit *estimator* on *data* and return one integer label per row.

    Uses ``labels_`` when the fitted estimator exposes it and falls back to
    ``predict`` on the same data otherwise. (The original fallback always
    predicted on ``X``, even after fitting on ``X_log2``.)
    """
    estimator.fit(data)
    if hasattr(estimator, 'labels_'):
        # plain int instead of the np.int alias, which newer NumPy removed
        return estimator.labels_.astype(int)
    return estimator.predict(data)


def _draw_panel(ax, data, labels, palette, marks):
    """Scatter *data* colored by *labels*, hide the ticks, add letter marks.

    *palette* is an array of color names indexed by label; *marks* is a list
    of ``(x, y, text)`` tuples placed in axes coordinates.
    """
    ax.scatter(data[:, 0], data[:, 1], color=palette[labels].tolist(), s=2)
    ax.set_xticks(())
    ax.set_yticks(())
    for x_pos, y_pos, text in marks:
        ax.text(x_pos, y_pos, text, ha='center', va='center',
                transform=ax.transAxes, fontsize=10)


# Letter positions (axes coordinates) for the untransformed and the log2 data.
raw_marks = [(0.2, 0.6, 'A'), (0.1, 0.2, 'B'), (0.4, 0.2, 'C'), (0.9, 0.7, 'D')]
log2_marks = [(0.4, 0.9, 'A'), (0.15, 0.6, 'B'), (0.7, 0.2, 'C'), (0.9, 0.8, 'D')]
# Log2 panels that show C and C' and leave out D (the D label was commented
# out in the original code).
log2_split_marks = [(0.4, 0.9, 'A'), (0.15, 0.6, 'B'), (0.7, 0.2, 'C'), (0.75, 0.5, 'C\'')]
# Untransformed Manhattan panel: A and A', no C (commented out in the original).
manhattan_raw_marks = [(0.2, 0.6, 'A'), (0.55, 0.55, 'A\''), (0.1, 0.2, 'B'), (0.9, 0.7, 'D')]

# no transformation, no clustering
# (ax, clustcolors and y_pred for this panel are set just before this point)
ax.set_title('Reference \nData', size=10)
ax.set_ylabel('No \nTransformation')
_draw_panel(ax, X, y_pred, clustcolors, raw_marks)

# log base2, no clustering
clustcolors = np.array(['darkseagreen', 'dodgerblue', 'orange', 'khaki'])
ax = plt.subplot2grid((10, 10), (2, 1), colspan=2, rowspan=2)
y_pred = np.asarray(y).astype(int)
ax.set_ylabel('Log base 2')
_draw_panel(ax, X_log2, y_pred, clustcolors, log2_marks)

# Clustered panels, in the original drawing order. Each entry is
# (subplot index in the 5x5 grid, estimator, data, palette, marks,
#  title or None, y-label or None, draw reference labels instead).
# Each palette is listed in the order of the labels that clustering returned.
clustered_panels = [
    # no transformation Euclidean
    (3, agglom, X, ['orange', 'khaki', 'dodgerblue', 'darkseagreen'],
     raw_marks, 'Euclidean', 'No \nTransformation', False),
    # log base2, Euclidean
    (8, agglom, X_log2, ['khaki', 'plum', 'dodgerblue', 'darkseagreen', 'k', 'k', 'k'],
     log2_split_marks, None, 'Log base 2', False),
    # no transformation Manhattan
    (4, agglom_manhattan, X, ['darkseagreen', 'khaki', 'saddlebrown', 'orange'],
     manhattan_raw_marks, 'Manhattan', None, False),
    # log base2, Manhattan
    (9, agglom_manhattan, X_log2, ['khaki', 'plum', 'darkseagreen', 'dodgerblue'],
     log2_split_marks, None, None, False),
    # no transformation Cosine
    (5, agglom_cosine, X, ['darkseagreen', 'dodgerblue', 'orange', 'khaki'],
     raw_marks, 'Cosine', None, True),
    # log base2, Cosine
    (10, agglom_cosine, X_log2, ['orange', 'dodgerblue', 'darkseagreen', 'khaki'],
     log2_marks, None, None, False),
]

for cell, estimator, data, palette, marks, title, ylabel, show_reference in clustered_panels:
    clustcolors = np.array(palette)
    ax = plt.subplot(5, 5, cell)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    y_pred = _cluster_labels(estimator, data)
    if show_reference:
        # NOTE(review): the original overwrites the cosine clustering result
        # of the untransformed data with the reference labels right after
        # fitting, so this panel shows the reference assignment. Kept as is
        # to reproduce the figure - confirm that this is intended.
        y_pred = np.asarray(y).astype(int)
    if title is not None:
        ax.set_title(title, size=10)
    _draw_panel(ax, data, y_pred, clustcolors, marks)

#######
###endplot
#######
fig.text(0.05,1,’A’,fontsize=20) fig.text(.33,1,’B’,fontsize=20) plt.show() In [12]: ##Cluster Review Figure 3, Panel C ## Transformations and Distance Metrics fig=plt.figure(figsize=(13,13)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,hspace=.01) ## Differential Clustering of GI cell lines from Lu (2005) based on Transformation #miRNA from Lu (2005) D = raw_miRNA89_filtered D_mc = D.sub(D.mean(axis=1),axis=0) 18 D_norm = D_mc.div(D.std(axis=1),axis=0) D_miRNA_column_labels = D.columns.tolist() D_miRNA = D_norm.transpose() #miRNA before log2 transformation D = 2 ** raw_miRNA89_filtered D_mc = D.sub(D.mean(axis=1),axis=0) D_norm = D_mc.div(D.std(axis=1),axis=0) D_norm.dropna(thresh=2, inplace=True) D_miRNAprelog_column_labels = D.columns.tolist() D_miRNAprelog = D_norm.transpose() # these are lists for the EP and GI tracks GI_list = [’_LVR_’,’_COLON_’,’_STOM_’,’_PAN_’] ### ### miRNA, as in Lu (2005) ### # Compute and plot dendrogram, clustering cell lines for miRNA ax_miRNA1 = fig.add_axes([0,.1,1,.30]) lnk1 = sch.linkage(D_miRNA, method=’average’,metric=’correlation’) Z_cl = sch.dendrogram(lnk1,color_threshold=0) idx_cl = Z_cl[’leaves’] ax_miRNA1.axis(’off’) ax_miRNA1.set_title(’miRNA\n’) # Add color strip to indicate GI status ax_miRNA2 = fig.add_axes([0,0,1,0.1]) list_vals = [0 if any(cl in val for cl in GI_list) else 1 for val in D_miRNA_column_labels] unsorted_list = np.array(list_vals) sorted_list= unsorted_list[np.array(idx_cl)] ax_miRNA2.matshow([sorted_list], aspect=’auto’, origin=’lower’, cmap = colors.ListedColormap([’ ax_miRNA2.axis(’off’) plt.show() 19 In [13]: ##Cluster Review Figure 3, Panel D ## Transformations and Distance Metrics fig=plt.figure(figsize=(13,13)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,hspace=.01) ### ### miRNA before log transformation ### # Compute and plot dendrogram, clustering cell lines for mRNA ax_miRNA_prelog1 = fig.add_axes([0,.1,1,.30]) lnk2 = 
sch.linkage(D_miRNAprelog, method=’average’,metric=’correlation’) Z_cl = sch.dendrogram(lnk2,color_threshold=0) idx_cl = Z_cl[’leaves’] ax_miRNA_prelog1.axis(’off’) ax_miRNA_prelog1.set_title(’miRNA\n(before log_2 transformation)’) # Add color strip to indicate GI status ax_miRNA_prelog2 = fig.add_axes([0,0,1,0.1]) list_vals = [0 if any(cl in val for cl in GI_list) else 1 for val in D_miRNAprelog_column_label unsorted_list = np.array(list_vals) sorted_list= unsorted_list[np.array(idx_cl)] ax_miRNA_prelog2.matshow([sorted_list], aspect=’auto’, origin=’lower’, cmap = colors.ListedColo ax_miRNA_prelog2.axis(’off’) plt.show() 20 In [14]: ##Cluster Review Figure 4 ## Algorithms # adapted from Scikit Learn, "Comparing different clustering algorithms on toy datasets" # found at http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html np.random.seed(0) ## Generate datasets. n_samples = 800 noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.3,noise=.07) noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) blobs = datasets.make_blobs(n_samples=n_samples, random_state=10,centers=3,cluster_std=1) #create noisy parallel lines mean1 = [0,1] cov1 = [[5,0],[0,0.1]] x1,y1 = np.random.multivariate_normal(mean1,cov1,n_samples/2).T mean2 = [0,-1] cov2 = [[5,0],[0,.1]] x2,y2 = np.random.multivariate_normal(mean2,cov2,n_samples/2).T coordinates = np.transpose(np.vstack((np.hstack((x1,x2)), np.hstack((y1,y2))))) categories = np.hstack((np.zeros(len(x1)),np.ones(len(y1)))) noisy_lines = (coordinates,categories) #create data with no structure no_structure = np.random.rand(n_samples, 2), None clustcolors = np.array([’darkseagreen’,’dodgerblue’,’orange’,’khaki’,’darkred’,’b’,’g’,’r’,’c’, clustcolors = np.hstack([clustcolors] * 20) clustering_names = [’K-Means’, ’Ward’, ’DBSCAN’, ’Mixture Models’] fig=plt.figure(figsize=(13, 13)) 21 plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) plot_num = 1 
dataset_list = [no_structure, blobs, noisy_lines, noisy_moons, noisy_circles]
# y-axis label of each dataset row, shown on the first panel of the row
row_labels = ['No\nStructure', 'Three\nClusters', 'Two\nWide Clusters',
              'Two\nHalf Moons', 'Two\nNested Circles']
# Keep using mixture.GMM where it exists (the class this notebook was written
# for) and fall back to GaussianMixture on scikit-learn releases without it.
gmm_class = getattr(mixture, 'GMM', None) or mixture.GaussianMixture

for i_dataset, dataset in enumerate(dataset_list):
    X, y = dataset
    # The mean-shift bandwidth estimate of the original was never read by any
    # estimator below and has been removed.

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # create clustering estimators
    two_means = cluster.MiniBatchKMeans(n_clusters=2)
    three_means = cluster.MiniBatchKMeans(n_clusters=3)
    # (both Ward lines were cut at the PDF margin after "connectivity=conn")
    ward_two = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                               connectivity=connectivity)
    ward_three = cluster.AgglomerativeClustering(n_clusters=3, linkage='ward',
                                                 connectivity=connectivity)
    dbscan = cluster.DBSCAN(eps=.3)
    #mixture model results
    gmm2 = gmm_class(n_components=2, covariance_type='full')
    gmm3 = gmm_class(n_components=3, covariance_type='full')

    clustering_algorithms = ['kmeans', 'ward', 'dbscan', 'mixturemodels']
    if dataset is blobs:
        # The three-blob dataset gets the three-cluster estimators. The
        # original selected ward_two here, leaving ward_three unused while
        # k-means and the mixture model already ran with three clusters.
        estimators = {'kmeans': three_means, 'ward': ward_three,
                      'dbscan': dbscan, 'mixturemodels': gmm3}
    else:
        estimators = {'kmeans': two_means, 'ward': ward_two,
                      'dbscan': dbscan, 'mixturemodels': gmm2}

    for name, algorithm_name in zip(clustering_names, clustering_algorithms):
        # predict cluster memberships
        algorithm = estimators[algorithm_name]
        algorithm.fit(X)
        if hasattr(algorithm, 'labels_'):
            # plain int instead of the np.int alias, which newer NumPy removed
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        # plot
        ax = plt.subplot(len(dataset_list), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            ax.set_title(name, size=11)
        # panels 1, 5, 9, 13 and 17 open a dataset row and carry its label
        if (plot_num - 1) % len(clustering_algorithms) == 0:
            ax.set_ylabel(row_labels[i_dataset])
        ax.scatter(X[:, 0], X[:, 1], color=clustcolors[y_pred].tolist(), s=1)
        ax.set_xticks(())
        ax.set_yticks(())
        ax.axis("equal")
        plot_num += 1

plt.show()

# [captured notebook output, cut at the PDF margin]
# /Users/knaegle/anaconda/lib/python2.7/site-packages/sklearn/cluster/hierarchical.py:205: UserWarning: th
#   connectivity, n components = fix connectivity(X, connectivity)

# In [15]:
##Cluster Review Figure 5, Panel A
## Ensemble Clustering Toy Example
np.random.seed(0)

n_samples = 300
blobs = datasets.make_blobs(n_samples=n_samples, random_state=10, centers=5, cluster_std=2)

# NOTE(review): this palette was cut at the PDF margin after 'r'; only the
# visible entries are listed. Restore any remaining colors from the
# repository. Until then, labels beyond the eighth wrap around to the start
# of the palette through the tiling on the next line.
clustcolors = np.array(['darkseagreen', 'dodgerblue', 'darkred', 'b',
                        'orange', 'lightcoral', 'g', 'r'])
clustcolors = np.hstack([clustcolors] * 200)

clustering_names = ['k=2', 'k=3', 'k=4', 'k=5', 'k=6', 'k=7', 'k=8', 'k=9', 'k=10']

fig = plt.figure(figsize=(10, 10))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.1, hspace=.1)

X, y = blobs
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)

# create clustering estimators
kmeans2 = cluster.MiniBatchKMeans(n_clusters=2)
kmeans3 = cluster.MiniBatchKMeans(n_clusters=3)
kmeans4 = cluster.MiniBatchKMeans(n_clusters=4)
kmeans5 = cluster.MiniBatchKMeans(n_clusters=5)
kmeans6 = cluster.MiniBatchKMeans(n_clusters=6)
kmeans7 = cluster.MiniBatchKMeans(n_clusters=7)
kmeans8 = cluster.MiniBatchKMeans(n_clusters=8)
kmeans9 = cluster.MiniBatchKMeans(n_clusters=9)
kmeans10 = cluster.MiniBatchKMeans(n_clusters=10)
# (cut at the PDF margin after "kmeans9,kmeans"; kmeans10 completes the nine
# entries that clustering_names pairs up with)
clustering_algorithms = [kmeans2, kmeans3, kmeans4, kmeans5, kmeans6,
                         kmeans7, kmeans8, kmeans9, kmeans10]

# Subplot positions are 1-based. The original started at 0, which triggered
# the matplotlib deprecation warning captured below.
plot_num = 1
for name, algorithm in zip(clustering_names, clustering_algorithms):
    # predict cluster memberships
    algorithm.fit(X)
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    # plot
    plt.subplot(3, 3, plot_num)
    plt.scatter(X[:, 0], X[:, 1], color=clustcolors[y_pred].tolist(), s=2)
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(())
    plt.axis("equal")
    # (cut at the PDF margin inside the 'right' string; closing restored)
    plt.text(.95, .84, (name), transform=plt.gca().transAxes, size=10,
             horizontalalignment='right')
    plot_num += 1

#############################################
plt.show()

# [captured notebook output, cut at the PDF margin]
# /Users/knaegle/anaconda/lib/python2.7/site-packages/matplotlib/axes/ subplots.py:69: MatplotlibDeprecati
#   mplDeprecation)

# In [16]:
##Cluster Review Figure 5, Panel B
## Ensemble Clustering Toy Example
fig = plt.figure(figsize=(10, 10))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.1, hspace=.1)

##############################################################
#co-occurrence matrix
##############################################################
dend_colors = ['darkseagreen', 'dodgerblue', 'darkred', 'b', 'orange',
               'gray', 'g', 'r', 'c', 'm', 'y']
sch.set_link_color_palette(dend_colors)

dim = n_samples
# co_matrix[a, b] counts in how many of the k-means solutions points a and b
# share a cluster; the diagonal stays at zero.
co_matrix = np.zeros(shape=(dim, dim))
for algorithm in clustering_algorithms:
    algorithm.fit(X)
    clustering_solution = algorithm.predict(X)
    clusterid_list = np.unique(clustering_solution)
    for clusterid in clusterid_list:
        members = np.where(clustering_solution == clusterid)[0]
        # Visit every unordered pair of cluster members exactly once. The
        # original outer loop ran over members[0:-2] and therefore never
        # counted the pair formed by the last two members of each cluster.
        # The loop names also no longer overwrite the label array y.
        for first, second in it.combinations(members, 2):
            co_matrix[first, second] += 1
            co_matrix[second, first] += 1

# The rows of the co-occurrence matrix are passed to linkage as observation
# vectors (alternative left disabled by the authors:
# D=ssd.squareform(co_matrix)).
D = co_matrix
dendrogram_distance = 35

# Compute and plot first dendrogram.
ax1 = fig.add_axes([0, 0, 0.09, 0.80])
Y = sch.linkage(D, method='average')
Z1 = sch.dendrogram(Y, orientation='right', color_threshold=dendrogram_distance)
ax1.set_xticks([])
ax1.set_yticks([])
fig.gca().invert_yaxis()  # this plus the y-axis invert in the heatmap flips the y-axis heatmap
ax1.axis('off')

# Compute second dendrogram.
Y = sch.linkage(D, method=’average’) Z2 = sch.dendrogram(Y, color_threshold=dendrogram_distance, no_plot=True) # Plot distance matrix. axmatrix = fig.add_axes([0.10,0,0.80,0.80]) idx1 = Z1[’leaves’] idx2 = Z2[’leaves’] sorted_co_matrix = co_matrix[idx1,:] sorted_co_matrix = sorted_co_matrix[:,idx2] im = axmatrix.matshow(sorted_co_matrix/np.amax(sorted_co_matrix), aspect=’auto’, origin=’lower’ axmatrix.set_xticks([]) axmatrix.set_yticks([]) fig.gca().invert_yaxis() # this plus the x-axis invert in the right-flipped dendrogram flips th # Plot colorbar. axcolor = fig.add_axes([0.96,0,0.02,0.80]) cbar=pylab.colorbar(im, cax=axcolor) axcolor.tick_params(labelsize=10) axcolor.set_yticklabels([’0%’,’10%’,’20%’,’30%’,’40%’,’50%’,’60%’,’70%’,’80%’,’90%’,’100%’,]) 27 plt.show() In [17]: ##Cluster Review Figure 5, Panel C ## Ensemble Clustering Toy Example fig=plt.figure(figsize=(10,10)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.1, hspace=.1) ############################################################## #thresholded co-occurrence matrix ############################################################## #D=ssd.squareform(co_matrix) D=co_matrix dendrogram_distance = 35 # Compute and plot first dendrogram. ax3 = fig.add_axes([0,0,0.08,0.40]) Y = sch.linkage(D, method=’average’) Z1 = sch.dendrogram(Y, orientation=’right’, color_threshold=dendrogram_distance) ax3.set_xticks([]) 28 ax3.set_yticks([]) fig.gca().invert_yaxis() # this plus the y-axis invert in the heatmap flips the y-axis heatmap ax3.axis(’off’) # Compute second dendrogram. Y = sch.linkage(D, method=’average’) Z2 = sch.dendrogram(Y, color_threshold=dendrogram_distance, no_plot=True) # Plot distance matrix. 
# ---------------------------------------------------------------------------
# Figure 5, Panel C (continued): heatmap and ensemble scatter plot.
#
# Uses names created earlier in this cell: `fig`, `co_matrix`, `Z1`, `Z2`,
# `Y`, `dendrogram_distance`, plus `X` and `dend_colors` from earlier cells.
#
# NOTE(review): this text was recovered from a PDF rendering of the notebook.
# Statements that were cut off at the page margin are flagged individually.
# ---------------------------------------------------------------------------
axmatrix2 = fig.add_axes([0.10, 0, 0.40, 0.40])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
sorted_co_matrix = co_matrix[idx1, :][:, idx2]

# Scale counts to [0, 1]; skip the division when the matrix is all zeros so
# the image does not become NaN.
peak = np.amax(sorted_co_matrix)
scaled_co_matrix = sorted_co_matrix / peak if peak else sorted_co_matrix

# NOTE(review): the original matshow(...) call is truncated inside the
# origin= argument ("origin='lowe"); 'lower' matches the Panel B call, and
# any keyword arguments after it are lost.
im2 = axmatrix2.matshow(scaled_co_matrix, aspect='auto', origin='lower')
axmatrix2.set_xticks([])
axmatrix2.set_yticks([])
# Paired with the y-axis inversion applied to the right-oriented dendrogram.
axmatrix2.invert_yaxis()

##############################################################
# ensemble result
##############################################################
# Flat cluster ids (1-based) from cutting the linkage at dendrogram_distance.
ind = sch.fcluster(Y, dendrogram_distance, 'distance')

axensemble = fig.add_axes([0.55, 0, 0.4, 0.4])
# Map cluster id -> colour.  Wrap around the palette so that more clusters
# than palette entries reuses colours instead of raising IndexError.
palette = np.asarray(dend_colors)
point_colors = palette[(ind - 1) % len(palette)].tolist()
axensemble.scatter(X[:, 0], X[:, 1], color=point_colors, s=5)
#plt.title("Ensemble Result", size=12)
axensemble.set_xlim(-2.5, 2.5)
axensemble.set_ylim(-2.5, 2.5)
axensemble.set_xticks([])
axensemble.set_yticks([])
axensemble.axis("equal")

plt.show()

# In [18]:
## Cluster Review Figure 5, Panel D
## Ensemble Clustering Toy Example
fig = plt.figure(figsize=(10, 10))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                    wspace=.1, hspace=.1)

#############################################
# zoomed co-occ matrix
#############################################
dend_colors = ['orange', 'gray', 'b', 'orange', 'k', 'k', 'k']
sch.set_link_color_palette(dend_colors)

# Boolean masks selecting the members of flat clusters 4, 5 and 6.
khaki_items = ind == 4
orange_items = ind == 5
blue_items = ind == 6
# Union of the three masks (logical OR).
interesting_items = khaki_items | orange_items | blue_items

# Restrict the co-occurrence matrix to those items on both axes.
D2 = co_matrix[interesting_items, :][:, interesting_items]
#dendrogram_distance = 35

# Compute and plot first dendrogram (drawn on the axes just added, which
# is the current axes).
ax5 = fig.add_axes([0, 0, 0.08, 0.40])
Y = sch.linkage(D2, method='average')
Z1 = sch.dendrogram(Y, orientation='right',
                    color_threshold=dendrogram_distance)
ax5.set_xticks([])
ax5.set_yticks([])
# This plus the y-axis inversion on the heatmap keeps both in the same order.
ax5.invert_yaxis()
ax5.axis('off')

# Second dendrogram: leaf order only.  The linkage input is unchanged, so
# the linkage `Y` computed above is reused rather than recomputed.
Z2 = sch.dendrogram(Y, color_threshold=dendrogram_distance, no_plot=True)

# Plot distance matrix.
axmatrix3 = fig.add_axes([0.10,0,0.40,0.40]) idx1 = Z1[’leaves’] idx2 = Z2[’leaves’] sorted_co_matrix = D2[idx1,:] sorted_co_matrix = sorted_co_matrix[:,idx2] im3 = axmatrix3.matshow(sorted_co_matrix/np.amax(sorted_co_matrix), aspect=’auto’, origin=’lowe axmatrix3.set_xticks([]) axmatrix3.set_yticks([]) fig.gca().invert_yaxis() # this plus the x-axis invert in the right-flipped dendrogram flips th #axmatrix3.set_title(’Zoom’) #plt.text(.95, .90, (’50% Threshold’),transform=plt.gca().transAxes, size=12,horizontalalignmen #Plot colorbar. #axcolor3 = fig.add_axes([.75,0.2,0.01,0.20]) 30 #cbar=pylab.colorbar(im3, cax=axcolor3) ############################################# #partially fuzzy result ############################################# axpf = fig.add_axes([0.55,0,0.4,0.4]) axpf.set_xticks([]) axpf.set_yticks([]) dend_colors=[’white’,’white’,’white’,’b’,’orange’,’gray’,’g’,’r’,’c’,’m’,’y’] sch.set_link_color_palette(dend_colors) orange_centroid=np.asarray([-1.3,-0.9]) blue_centroid=np.asarray([0.55,-0.4]) plt.scatter(X[:, 0], X[:, 1], color=np.asarray(dend_colors)[ind-1].tolist(), s=5) plt.scatter(orange_centroid[0],orange_centroid[1],marker=’D’,edgecolor = ’k’,color=’darkorange’ plt.scatter(blue_centroid[0],blue_centroid[1],marker=’D’,edgecolor = ’k’,color=’dodgerblue’,s=6 # equidistant line from centroids plt.plot([-1, 0.5],[0.7,-2.2],’--r’) #plt.title("Partially Fuzzy Result", size=12) plt.xlim(-2.3, 1.2) plt.ylim(-2.3, 0.8) plt.xticks(()) plt.yticks(()) plt.show() In [19]: ##Cluster Review Figure 6, Panel A ## Ensemble Clustering Example plt.rcParams[’lines.linewidth’] = 1 dend_colors=[’darkseagreen’,’dodgerblue’,’darkred’,’b’,’orange’,’gray’,’g’,’r’,’c’,’m’,’y’] 31 sch.set_link_color_palette(dend_colors) D = raw_phosprot row_labels = D.index.get_level_values(’gene_site’) fig = pylab.figure(figsize=(5,10)) panel1 = fig.add_axes([0,0,1,1]) panel1.axis(’off’) #panel1.set_title(’Single\nClustering Solution’,y=1.05) ## panel 1 cluster_list = [’EGFR_Y1172’, 
’EGFR_Y1197’, ’GAB1_Y659’, ’GAB1_Y627’, ’SHC1_Y427’, ’SHC1_Y349_Y # Compute and plot left dendrogram, clustering phospho-dynamics ax1 = add_subplot_axes(panel1,[0.0,0.1,0.28,0.9]) lnk1 = sch.linkage(D, method=’ward’,metric=’euclidean’) Z_pp = sch.dendrogram(lnk1, orientation=’right’,color_threshold=3) idx_pp = Z_pp[’leaves’] fig.gca().invert_yaxis() # must couple with matshow origin=’upper’, below, to match Lu(2005) Fi ax1.set_xticks([]) for side in [’top’,’right’,’bottom’,’left’]: ax1.spines[side].set_visible(False) ax1.axis(’off’) # plot heatmap axmatrix = add_subplot_axes(panel1,[0.56,0.1,0.44,0.9]) hm = D hm = hm.ix[idx_pp,:] im = axmatrix.matshow(hm, aspect=’auto’, origin=’upper’, cmap=’Blues’, vmin = 0, vmax = 3) #axmatrix.axis(’off’) for side in [’top’,’right’,’bottom’,’left’]: axmatrix.spines[side].set_visible(False) axmatrix.set_xticks([]) axmatrix.set_xticklabels([]) axmatrix.set_yticks([]) # Add color strip to indicate MAL type (Normal, Tumor or TCL) ax2 = add_subplot_axes(panel1,[0.32,0.1,0.20,0.9]) list_vals = [0 if any(pp in val for pp in cluster_list) else 1 for val in row_labels] unsorted_list = np.array(list_vals) unsorted_list[row_labels == ’PDLIM1_Y321’]=2 sorted_list= unsorted_list[np.array(idx_pp)] ax2.matshow(sorted_list[None].T, aspect=’auto’, origin=’upper’, cmap = colors.ListedColormap([’ ax2.set_xticks([]) ax2.set_yticks([]) ax2.axis(’off’) plt.show() 32 33 In [20]: ##Cluster Review Figure 6, Panel B ## Ensemble Clustering Example fig = pylab.figure(figsize=(10,10)) panel3 = fig.add_axes([0,0,1,1]) panel3.axis(’off’) ## Panel 3 D = raw_phosprot row_labels = D.index.get_level_values(’gene_site’) cluster_list = [’EGFR_Y1172’, ’EGFR_Y1197’, ’GAB1_T659’, ’GAB1_Y627’, ’SHC_Y427’, ’SHC_Y349_Y35 dist_metrics = [’euclidean’, ’correlation’, ’cityblock’, ’cosine’, ’braycurtis’, ’canberra’, ’c bool_dist_metrics = [’dice’, ’jaccard’, ’kulsinski’, ’matching’, ’rogerstanimoto’, ’russellrao’ lnk_methods = [’single’, ’complete’, ’average’, ’weighted’, 
’median’, ’centroid’, ’ward’] final_clust_soln = np.zeros([len(raw_phosprot),len(raw_phosprot)]) for dist_metric in dist_metrics: for lnk_method in lnk_methods: if (lnk_method == ’ward’ or lnk_method == ’centroid’ or lnk_method == ’median’) and dis continue else: lnk1 = sch.linkage(D, method=lnk_method, metric = dist_metric) ## define clusters here k=14 cluster_soln = [dist_metric, lnk_method,fcluster(lnk1, k, criterion=’maxclust’)] bin_clust_soln = np.zeros((max(cluster_soln[2]),len(cluster_soln[2]))) for i,entry in enumerate(cluster_soln[2]): bin_clust_soln[entry-1,i] = 1 ## assigns 1 to category column, corrected for zero-i coocc_single = bin_clust_soln.T.dot(bin_clust_soln) final_clust_soln = final_clust_soln + coocc_single final_clust_soln_df = pd.DataFrame(final_clust_soln.astype(int)) # these are separate, not in creation clause, due to super odd floating point errors final_clust_soln_df.index = row_labels final_clust_soln_df.columns = row_labels D = final_clust_soln_df row_labels = D.index.get_level_values(’gene_site’) 34 cluster_list = [’EGFR_Y1172’, ’EGFR_Y1197’, ’GAB1_Y659’, ’GAB1_Y627’, ’SHC1_Y427’, ’SHC1_Y349_Y # Compute and plot left dendrogram ax1 = add_subplot_axes(panel3,[0.0,0.3,0.10,.6]) lnk1 = sch.linkage(D, method=’ward’,metric=’euclidean’) Z_pp = sch.dendrogram(lnk1, orientation=’right’) idx_pp = Z_pp[’leaves’] #ax1.set_yticklabels(row_labels[idx_pp],size=3) ax1.set_yticks([]) fig.gca().invert_yaxis() # must couple with matshow origin=’upper’, below, to match Lu(2005) Fi ax1.set_xticks([]) for side in [’top’,’right’,’bottom’,’left’]: ax1.spines[side].set_visible(False) #ax1.axis(’off’) # plot heatmap axmatrix = add_subplot_axes(panel3,[0.28,0.3,0.7,.6]) hm = D.divide(35) hm = hm.ix[idx_pp,idx_pp] im = axmatrix.matshow(hm, aspect=’auto’, origin=’upper’, cmap=’afmhot’) axmatrix.axis(’off’) # Add color strip to indicate PDLIM1 cluster presence ax2 = add_subplot_axes(panel3,[0.13,0.3,0.13,0.6]) list_vals = [0 if any(pp in val for pp in 
cluster_list) else 1 for val in row_labels] unsorted_list = np.array(list_vals) unsorted_list[row_labels == ’PDLIM1_Y321’]=2 sorted_list= unsorted_list[np.array(idx_pp)] ax2.matshow(sorted_list[None].T, aspect=’auto’, origin=’upper’, cmap = colors.ListedColormap([’ ax2.set_xticks([]) ax2.set_yticks([]) ax2.axis(’off’) # Plot colorbar indicating scale axcolor = add_subplot_axes(panel3,[0.28,0.2,0.7,.02]) # [xmin, ymin, dx, and dy] h=pylab.colorbar(im, cax=axcolor,orientation=’horizontal’) h.ax.tick_params(labelsize=10) h.set_ticks([0,.25,.50,.75,1]) h.set_ticklabels([’0%’,’25%’,’50%’,’75%’,’100%’]) plt.show() 35 In [21]: ##Cluster Review Figure 6, Panel C ## Ensemble Clustering Example fig = pylab.figure(figsize=(10,10)) ## panel 2 D = raw_phosprot row_labels = D.index.get_level_values(’gene_site’) cluster_list = [’EGFR_Y1172’, ’EGFR_Y1197’, ’GAB1_Y659’, ’GAB1_Y627’, ’SHC1_Y427’, ’SHC1_Y349_Y dist_metrics = [’euclidean’, ’correlation’, ’cityblock’, ’cosine’] lnk_methods = [’single’, ’complete’, ’average’] plotnum = 1 36 for dist_metric in dist_metrics: for lnk_method in lnk_methods: #make subplot panel2 = fig.add_subplot(len(dist_metrics),len(lnk_methods),plotnum) panel2.axis(’off’) # Add dendrogram axis subpos = [0.0,0.22,1,0.78] subax1 = add_subplot_axes(panel2,subpos) lnk1 = sch.linkage(D, method=lnk_method, metric = dist_metric) Z = sch.dendrogram(lnk1,color_threshold = 0.15*max(lnk1[:,2])) idx_leaves = Z[’leaves’] subax1.set_xticks([]) subax1.set_yticks([]) subax1.spines[’top’].set_visible(False) subax1.spines[’right’].set_visible(False) subax1.spines[’bottom’].set_visible(False) subax1.spines[’left’].set_visible(False) if plotnum in [1,2,3]: subax1.set_title(lnk_method.title(),size=12) if plotnum in [1,4,7,10]: subax1.set_ylabel(dist_metric.title(),size=12) # Add color strip axis subpos = [0,0,1,0.2] subax2 = add_subplot_axes(panel2,subpos) list_vals = [0 if any(pp in val for pp in cluster_list) else 1 for val in row_labels] unsorted_list = np.array(list_vals) 
unsorted_list[row_labels == ’PDLIM1_Y321’]=2 sorted_list= unsorted_list[np.array(idx_leaves)] subax2.matshow([sorted_list], aspect=’auto’, origin=’lower’, cmap = colors.ListedColorm subax2.set_xticks([]) subax2.set_yticks([]) subax2.axis(’off’) plotnum+=1 plt.show() 37 In [ ]: 38
© Copyright 2026 Paperzz