 

Heatmap of a huge dataset

I have a tab-delimited file containing genomic regions and the biological entities found in each region (I checked for 67 entities, so each region was scored for the presence or absence of these 67 entities and their frequency).

I have all this data in a tabular format.

Sample data is given below:

Region  ATF3    BCL3    BCLAF1  BDP1    BRF1    BRF2    Brg1    CCNT2   CEBPB   CHD2    CTCF    CTCFL   E2F6    ELF1
chr1:109102470:109102970    0   0   1   0   0   0   0   1   0   0   4   1   4   1
chr1:110526886:110527386    0   0   0   0   0   0   0   1   1   0   4   1   0   1
chr1:115300671:115301171    0   0   1   0   0   0   0   0   1   1   4   1   1   1
chr1:115323308:115323808    0   0   0   0   0   0   0   1   0   0   2   1   1   0
chr1:11795641:11796141  1   0   0   0   0   0   0   1   2   0   0   0   1   0
chr1:118148103:118148603    0   0   0   0   0   0   0   1   0   0   0   0   0   1
chr1:150521397:150521897    0   0   0   0   0   0   0   2   2   0   6   2   4   0
chr1:150601609:150602109    0   0   0   0   0   0   0   0   3   2   0   0   1   0
chr1:150602098:150602598    0   0   0   0   0   0   0   0   1   1   0   0   0   0
chr1:151119140:151119640    0   0   0   0   0   0   0   1   0   0   0   0   1   0
chr1:151128604:151129104    0   0   0   0   0   0   0   0   0   0   3   0   0   0
chr1:153517729:153518229    0   0   0   0   0   0   0   0   0   0   0   0   0   0
chr1:153962738:153963238    0   0   0   0   0   0   0   1   1   0   0   0   0   1
chr1:154155682:154156182    0   0   0   0   0   0   0   1   0   0   0   0   1   1
chr1:154155725:154156225    0   0   0   0   0   0   0   1   0   0   0   0   1   1
chr1:154192154:154192654    0   0   0   0   0   0   0   0   0   0   0   0   0   0
chr1:154192824:154193324    1   0   0   0   0   0   0   1   0   1   0   0   1   1
chr1:154192943:154193443    1   0   0   0   0   0   0   1   0   2   0   0   1   1
chr1:154193273:154193773    1   0   0   0   0   0   0   1   0   2   0   0   2   1
chr1:154193313:154193813    0   0   0   0   0   0   0   1   0   2   0   0   2   1
chr1:155904188:155904688    0   0   0   0   0   0   0   1   0   0   0   0   1   1
chr1:155947966:155948466    0   0   0   0   0   0   0   1   0   0   3   0   0   1
chr1:155948336:155948836    0   0   0   0   0   0   0   1   0   0   5   1   0   1
chr1:156023516:156024016    0   0   0   0   0   0   0   1   0   1   4   1   1   1
chr1:156024016:156024516    0   1   1   0   0   0   0   0   0   2   0   0   1   1
chr1:156163229:156163729    0   0   0   0   0   0   0   0   0   0   2   0   0   1
chr1:160990902:160991402    0   0   0   0   0   0   0   0   0   1   0   0   1   2
chr1:160991133:160991633    0   0   0   0   0   0   0   0   0   1   0   0   1   2
chr1:161474704:161475204    0   0   0   0   0   0   0   0   0   0   0   0   0   0
chr1:161509530:161510030    0   0   1   1   1   0   0   0   1   0   1   0   0   1
chr1:161590964:161591464    0   0   0   1   1   0   0   0   0   0   0   0   0   0
chr1:169075446:169075946    0   0   0   0   0   0   0   2   0   0   4   0   3   0
chr1:17053279:17053779  0   0   0   1   0   0   0   0   0   1   0   0   0   0
chr1:1709909:1710409    0   0   0   0   0   0   0   2   0   1   0   0   3   1
chr1:1710297:1710797    0   0   0   0   0   0   0   0   0   1   6   0   1   1

How can I put this into a heat map, colour-coded from light red to dark red depending on the frequency, with white where an entity is absent?

Is there a better way to represent this type of data?
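For illustration, a minimal sketch of the kind of plot I have in mind, assuming the table is read with pandas (regions.txt is just a placeholder file name):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("regions.txt", sep="\t", index_col=0)   # placeholder for the tab-delimited file

cmap = plt.cm.Reds.copy()
cmap.set_bad("white")                                     # masked cells are drawn in white

masked = np.ma.masked_equal(df.values, 0)                 # hide zeros (absent entities)

fig, ax = plt.subplots(figsize=(8, 10))
im = ax.imshow(masked, cmap=cmap, interpolation="nearest", aspect="auto", vmin=0)
ax.set_xticks(range(len(df.columns)))
ax.set_xticklabels(df.columns, rotation=90)
ax.set_yticks([])                                         # too many regions to label individually
fig.colorbar(im, label="frequency")
plt.show()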

asked by Angelo

1 Answer

Following the comments on my other answer, the OP had a further question about finding 2D clusters. Here is an answer.

Taken from my library eegpy, I use a method called find_clusters. It walks across a 2D array and finds all clusters of values above / below a given threshold.

Here is my code:

import pylab as plt
import numpy as np
from queue import Queue


def find_clusters(ar,thres,cmp_type="greater"):
    """For a given 2d-array (test statistic), find all clusters which
    are above/below a given threshold.
    """
    if cmp_type not in ["lower","greater","abs_greater"]:
        raise ValueError("cmp_type must be in [\"lower\",\"greater\",\"abs_greater\"]")
    clusters = []
    if cmp_type=="lower":
        ar_in = (ar<thres).astype(bool)
    elif cmp_type=="greater":
        ar_in = (ar>thres).astype(bool)
    else: #cmp_type=="abs_greater":
        ar_in = (abs(ar)>thres).astype(bool)

    already_visited = np.zeros(ar_in.shape,bool)
    for i_s in range(ar_in.shape[0]): #i_s as in i_sample
        for i_f in range(ar_in.shape[1]):
            if not already_visited[i_s,i_f]:
                if ar_in[i_s,i_f]:
                    #print("number of clusters so far:", len(clusters))
                    mask = np.zeros(ar_in.shape,bool)
                    check_queue = Queue()
                    check_queue.put((i_s,i_f))
                    while not check_queue.empty():
                        pos_x,pos_y = check_queue.get()
                        if not already_visited[pos_x,pos_y]:
                            #print pos_x,pos_y
                            already_visited[pos_x,pos_y] = True
                            if ar_in[pos_x,pos_y]:
                                mask[pos_x,pos_y] = True
                                for coords in [(pos_x-1,pos_y),(pos_x+1,pos_y),(pos_x,pos_y-1),(pos_x,pos_y+1)]: #Direct Neighbors
                                    if 0<=coords[0]<ar_in.shape[0] and 0<=coords[1]<ar_in.shape[1]:
                                        check_queue.put(coords)
                    clusters.append(mask)
    return clusters

fn = "14318737.txt"
with open(fn, "r") as f:
    labels = f.readline().rstrip("\n").split()[1:]
data = np.loadtxt(fn, skiprows=1, converters={0:lambda x: 0})

clusters = find_clusters(data, 0, "greater")

plot_data = np.ma.masked_equal(data[:,1:], 0)

plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect = "auto", 
           vmin=0, extent=[0.5,plot_data.shape[1]+0.5, plot_data.shape[0] - 0.5, -0.5])
plt.colorbar()

for cl in clusters:
    plt.contour(cl.astype(int),[0.5], colors="k", linewidths=2)
plt.xticks(np.arange(1, len(labels)+1), labels, rotation=90, va="top", ha="center")


plt.show()

which gives an image of the form:

Plot with contour around clusters

clusters is a list of boolean 2D arrays (True / False). Each array represents one cluster, and each boolean value indicates whether a specific "point" belongs to that cluster. You can use these masks in any further analysis.
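For example, a small sketch (reusing data and clusters from the listing above) that ranks the clusters by how many cells they cover and by the total frequency they enclose:

ranked = sorted(clusters, key=lambda m: m.sum(), reverse=True)
for i, mask in enumerate(ranked[:5]):                     # the five largest clusters
    n_cells = int(mask.sum())                             # number of cells in the cluster
    total = int(data[mask].sum())                         # summed frequencies inside it
    rows = np.where(mask.any(axis=1))[0]                  # row (region) indices touched
    print("cluster %d: %d cells, total frequency %d, rows %d-%d"
          % (i, n_cells, total, rows.min(), rows.max()))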

EDIT

Now for some more fun with the clusters:

import pylab as plt
import numpy as np
from queue import Queue


def find_clusters(ar,thres,cmp_type="greater"):
    """For a given 2d-array (test statistic), find all clusters which
    are above/below a given threshold.
    """
    if cmp_type not in ["lower","greater","abs_greater"]:
        raise ValueError("cmp_type must be in [\"lower\",\"greater\",\"abs_greater\"]")
    clusters = []
    if cmp_type=="lower":
        ar_in = (ar<thres).astype(bool)
    elif cmp_type=="greater":
        ar_in = (ar>thres).astype(bool)
    else: #cmp_type=="abs_greater":
        ar_in = (abs(ar)>thres).astype(bool)

    already_visited = np.zeros(ar_in.shape,bool)
    for i_s in range(ar_in.shape[0]): #i_s as in i_sample
        for i_f in range(ar_in.shape[1]):
            if not already_visited[i_s,i_f]:
                if ar_in[i_s,i_f]:
                    #print("number of clusters so far:", len(clusters))
                    mask = np.zeros(ar_in.shape,bool)
                    check_queue = Queue()
                    check_queue.put((i_s,i_f))
                    while not check_queue.empty():
                        pos_x,pos_y = check_queue.get()
                        if not already_visited[pos_x,pos_y]:
                            #print pos_x,pos_y
                            already_visited[pos_x,pos_y] = True
                            if ar_in[pos_x,pos_y]:
                                mask[pos_x,pos_y] = True
                                for coords in [(pos_x-1,pos_y),(pos_x+1,pos_y),(pos_x,pos_y-1),(pos_x,pos_y+1)]: #Direct Neighbors
                                    if 0<=coords[0]<ar_in.shape[0] and 0<=coords[1]<ar_in.shape[1]:
                                        check_queue.put(coords)
                    clusters.append(mask)
    return clusters

fn = "14318737.txt"
data = []
with open(fn, "r") as f:
    labels = f.readline().rstrip("\n").split()[1:]
    for line in f:
        data.append([int(v) for v in line.split()[1:]])
data = np.array(data) #np.loadtxt(fn, skiprows=1, usecols=range(1,15))#converters={0:lambda x: 0})

clusters = find_clusters(data, 0, "greater")
large_clusters = [cl for cl in clusters if cl.sum()>5] #Only take clusters with more than five cells
large_clusters = sorted(large_clusters, key=lambda cl: -cl.sum())

plot_data = np.ma.masked_equal(data[:,:], 0)

plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect = "auto", 
           vmin=0, extent=[-0.5,plot_data.shape[1]-0.5, plot_data.shape[0] - 0.5, -0.5])
plt.colorbar()

for cl in large_clusters:
    plt.contour(cl.astype(int),[.5], colors="k", linewidths=2)
plt.xticks(np.arange(len(labels)), labels, rotation=90, va="top", ha="center")

print "Summary of all large clusters:\n"
print "#\tSize\tIn regions"
for i, cl in enumerate(large_clusters):
    print "%i\t%i\t" % (i, cl.sum()),
    regions_in_cluster = np.where(np.any(cl, axis=0))[0]
    min_region = labels[min(regions_in_cluster)]
    max_region = labels[max(regions_in_cluster)]
    print "%s to %s" % (min_region, max_region)

plt.xlim(-0.5,plot_data.shape[1]-0.5)

plt.show()

I keep only the clusters that contain more than five points and plot just those. You could alternatively filter on the sum of the data inside each cluster. I then sort these large clusters by size, in descending order.
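As a sketch of that alternative (min_total is an assumed threshold, not taken from the code above), the filtering step could be replaced by:

min_total = 10 # assumed threshold on the summed frequency inside a cluster
large_clusters = [cl for cl in clusters if data[cl].sum() > min_total]
large_clusters = sorted(large_clusters, key=lambda cl: data[cl].sum(), reverse=True)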

Finally, I print a summary of all large clusters, including the range of columns (entities) each one spans.

answered by Thorsten Kranz