sadII_lab8_ucsc_data

2248 days ago by macieksk

# pakiet "rtracklayer" # Ponizej przykladowa sesja z uzyciem tego pakietu, # natomiast dane mozna takze wczytac z pliku (pare komorek nizej) 
       
library(rtracklayer) session <- browserSession("UCSC") 
       
Loading required package: RCurl
Loading required package: bitops
tnames<-trackNames(session) length(tnames) head(tnames,20) 
       
[1] 146
    Base Position   Chromosome Band       STS Markers       FISH
Clones       Recomb Rate 
          "ruler"        "cytoBand"          "stsMap"     
"fishClones"      "recombRate" 
     ENCODE Pilot       Map Contigs          Assembly   GRC Map
Contigs               Gap 
  "encodeRegions"          "ctgPos"            "gold"        
"ctgPos2"             "gap" 
    BAC End Pairs  Fosmid End Pairs        GC Percent GRC Patch
Release         Hg18 Diff 
    "bacEndPairs"     "fosEndPairs"         "gc5Base"
"altSeqComposite"  "hg19ContigDiff" 
     Hi Seq Depth     NCBI Incident       Short Match     Restr
Enzymes        Wiki Track 
     "hiSeqDepth"  "ncbiIncidentDb"      "oligoMatch"        
"cutters"       "wikiTrack" 
tnames[grep("miRNA",names(tnames))] 
       
     sno/miRNA TS miRNA sites 
       "wgRna"  "targetScanS" 
%time query <- ucscTableQuery(session, "wgRna") tableNames(query) 
       
[1] "wgRna"
CPU time: 0.00 s,  Wall time: 8.32 s
tableName(query)<-"wgRna" mirna <- getTable(query) 
       
# Wczytywanie tabeli miRNA zapisanej wczesniej w innej sesji load("data/miRNA.Rdata") ls() 
       
[1] "f"       "logRfun" "mirna"   "query"   "session" "st"     
"tnames" 
dim(mirna) head(mirna) summary(mirna) 
       
[1] 1341   11
  bin chrom chromStart chromEnd           name score strand
thickStart thickEnd  type width
1 585  chr1      30365    30503 hsa-mir-1302-2     0      +         
0        0 miRNA   138
2 593  chr1    1102483  1102578   hsa-mir-200b     0      +         
0        0 miRNA    95
3 593  chr1    1103242  1103332   hsa-mir-200a     0      +         
0        0 miRNA    90
4 593  chr1    1104384  1104467    hsa-mir-429     0      +         
0        0 miRNA    83
5 608  chr1    3044538  3044599   hsa-mir-4251     0      +         
0        0 miRNA    61
6 611  chr1    3477258  3477354   hsa-mir-551a     0      -         
0        0 miRNA    96
      bin           chrom       chromStart           chromEnd       
name          score  
 Min.   : 110   chr14  :118   Min.   :    30143   Min.   :    30281 
14q(0)  :   1   Min.   :0  
 1st Qu.: 805   chr1   :115   1st Qu.: 28906892   1st Qu.: 28907024 
14q(I-1):   1   1st Qu.:0  
 Median :1062   chr15  :115   Median : 62622763   Median : 62622838 
14q(I-2):   1   Median :0  
 Mean   :1175   chr19  :105   Mean   : 77498437   Mean   : 77498529 
14q(I-3):   1   Mean   :0  
 3rd Qu.:1425   chrX   :101   3rd Qu.:110141514   3rd Qu.:110141589 
14q(I-4):   1   3rd Qu.:0  
 Max.   :2485   chr17  : 76   Max.   :249120575   Max.   :249120642 
14q(I-5):   1   Max.   :0  
                (Other):711                                         
(Other) :1335              
 strand    thickStart    thickEnd      type         width       
 -:610   Min.   :0    Min.   :0   CDBox  :269   Min.   : 44.00  
 +:731   1st Qu.:0    1st Qu.:0   HAcaBox:112   1st Qu.: 76.00  
         Median :0    Median :0   miRNA  :939   Median : 85.00  
         Mean   :0    Mean   :0   scaRna : 21   Mean   : 92.47  
         3rd Qu.:0    3rd Qu.:0                 3rd Qu.: 97.00  
         Max.   :0    Max.   :0                 Max.   :548.00  
                                                                
mirna$width<-with(mirna,chromEnd-chromStart) 
       
library(lattice) histogram(~(chromEnd-chromStart)|type,data=mirna,breaks=20) dev.off() 
       
Warning messages:
1: In min(x) : no non-missing arguments to min; returning Inf
2: In max(x) : no non-missing arguments to max; returning -Inf
null device 
          1 
histogram(~width|type,data=subset(mirna,type=="miRNA"),breaks=50) dev.off() 
       
Warning messages:
1: In min(x) : no non-missing arguments to min; returning Inf
2: In max(x) : no non-missing arguments to max; returning -Inf
null device 
          1 
png(width=800,height=1000) histogram(~width|chrom,data=subset(mirna,type=="miRNA"),breaks=20) dev.off() 
       
Warning messages:
1: In min(x) : no non-missing arguments to min; returning Inf
2: In max(x) : no non-missing arguments to max; returning -Inf
null device 
          1 
histogram(~width|strand,data=subset(mirna,type=="miRNA"),breaks=50,type="c") dev.off() 
       
Warning messages:
1: In min(x) : no non-missing arguments to min; returning Inf
2: In max(x) : no non-missing arguments to max; returning -Inf
3: In histogram.formula(~width | strand, data = subset(mirna, type
==  :
  type='count' can be misleading in this context
null device 
          1 
# Czy rozklady wielkosci na obydwu wstegach sa takie same? Zastosuj odpowiedni test. with(subset(mirna,type=="miRNA"),ks.test(width[strand=="+"],width[strand=="-"])) 
       
	Two-sample Kolmogorov-Smirnov test

data:  width[strand == "+"] and width[strand == "-"] 
D = 0.0743, p-value = 0.1493
alternative hypothesis: two-sided 

Warning message:
In ks.test(width[strand == "+"], width[strand == "-"]) :
  p-values will be approximate in the presence of ties
## Jaki to moze byc rozklad? Wykorzystujac wiedze z wykladu postaraj sie dopasowac najlepszy rozklad. 
       
names(mirna$bin) 
       
NULL
# Ilosc miRNA na chromosomach mirna.chr.counts<-table(subset(mirna,type=="miRNA")$chrom) mirna.chr.counts barchart(~sort(mirna.chr.counts)) dev.off() 
       
 chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 
chr2 chr20 chr21 chr22  chr3 
   75    36    41    32    21    69    33    28    45    14    91   
48    30    10    21    44 
 chr4  chr5  chr6  chr7  chr8  chr9  chrX 
   34    39    23    40    39    37    89 

null device 
          1 
# Pytanie: czy ilosc miRNA zalezy tylko od wielkosci chromosomu? # Czy jakis chromosom jest wyjatkowy pod tym wzgledem? 
       
# Wielkosci chromosomu: sciagamy wspolrzedne prazkow od kariotypu # Sciagnij query "cytoBand" i tabele "cytoBand" 
       
%time library(rtracklayer) session <- browserSession("UCSC") query <- ucscTableQuery(session, "cytoBand") tableNames(query) 
       
[1] "cytoBand"
CPU time: 0.00 s,  Wall time: 8.10 s
tableName(query)<-"cytoBand" cband <- getTable(query) 
       
head(cband) summary(cband) 
       
  chrom chromStart chromEnd   name gieStain
1  chr1          0  2300000 p36.33     gneg
2  chr1    2300000  5400000 p36.32   gpos25
3  chr1    5400000  7200000 p36.31     gneg
4  chr1    7200000  9200000 p36.23   gpos25
5  chr1    9200000 12700000 p36.22     gneg
6  chr1   12700000 16200000 p36.21   gpos50
     chrom       chromStart           chromEnd              name    
gieStain  
 chr1   : 63   Min.   :        0   Min.   :  2200000   p11.1  : 20  
gneg   :414  
 chr2   : 62   1st Qu.: 29000000   1st Qu.: 32475000   q11.1  : 17  
gpos50 :121  
 chr3   : 62   Median : 62950000   Median : 66650000   q22.2  : 17  
gpos75 : 89  
 chr6   : 48   Mean   : 72987007   Mean   : 76578280   q21.2  : 16  
gpos25 : 87  
 chr4   : 47   3rd Qu.:107475000   3rd Qu.:111650000   q22.1  : 16  
gpos100: 81  
 chr5   : 45   Max.   :243700000   Max.   :249250621   p11.2  : 15  
acen   : 48  
 (Other):535                                           (Other):761  
(Other): 22  
chr.sizes<-with(cband,tapply(chromEnd,chrom,max)) library(lattice) barchart(chr.sizes) dev.off() 
       
null device 
          1 
# Zadanie: czy ilosc miRNA zalezy tylko od wielkosci chromosomu? # Ktore chromosomy sa wyjatkowe pod tym wzgledem? 
       
mirna.size.data<-data.frame(mirna.cnt=c(mirna.chr.counts,0), chr.size=as.vector(chr.sizes), chr=names(chr.sizes), row.names=names(chr.sizes)) 
       
fit<-lm(mirna.cnt~chr.size,data=mirna.size.data) summary(fit) 
       
Call:
lm(formula = mirna.cnt ~ chr.size, data = mirna.size.data)

Residuals:
    Min      1Q  Median      3Q     Max 
-29.574  -9.258  -4.310   0.206  61.460 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept) 2.143e+01  1.098e+01   1.951   0.0639 .
chr.size    1.372e-07  7.794e-08   1.760   0.0923 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

Residual standard error: 21.65 on 22 degrees of freedom
Multiple R-squared: 0.1235,	Adjusted R-squared: 0.08362 
F-statistic: 3.099 on 1 and 22 DF,  p-value: 0.09225 
xyplot(mirna.cnt~chr.size, data=mirna.size.data, type=c("p","smooth","r"), auto.key=T) dev.off() 
       
null device 
          1 
png(h=800,w=800) layout(matrix(1:4,2,2)) plot(fit) dev.off() 
       
null device 
          1 
fit2<-lm(mirna.cnt~chr.size,data=subset(mirna.size.data,!chr%in%paste("chr",c(19,1,14,"X"),sep="") ) ) summary(fit2) 
       
Call:
lm(formula = mirna.cnt ~ chr.size, data = subset(mirna.size.data, 
    !chr %in% paste("chr", c(19, 1, 14, "X"), sep = "")))

Residuals:
    Min      1Q  Median      3Q     Max 
-20.372  -7.128   1.994   4.388  21.241 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) 1.116e+01  5.502e+00   2.028  0.05768 . 
chr.size    1.552e-07  4.017e-08   3.864  0.00114 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

Residual standard error: 9.547 on 18 degrees of freedom
Multiple R-squared: 0.4534,	Adjusted R-squared: 0.423 
F-statistic: 14.93 on 1 and 18 DF,  p-value: 0.001137 
png(h=800,w=800) layout(matrix(1:4,2,2)) plot(fit2) dev.off() 
       
null device 
          1 
mirna.size.data$model.fit <- NA mirna.size.data[names(fitted(fit2)),"model.fit"] <- fitted(fit2) 
       
mirna.size.data 
       
      mirna.cnt  chr.size   chr model.fit
chr1         75 249250621  chr1        NA
chr10        36 135534747 chr10  32.19372
chr11        41 135006516 chr11  32.11172
chr12        32 133851895 chr12  31.93250
chr13        21 115169878 chr13  29.03260
chr14        69 107349540 chr14        NA
chr15        33 102531392 chr15  27.07081
chr16        28  90354753 chr16  25.18070
chr17        45  81195210 chr17  23.75892
chr18        14  78077248 chr18  23.27494
chr19        91  59128983 chr19        NA
chr2         48 243199373  chr2  48.90584
chr20        30  63025520 chr20  20.93855
chr21        10  48129895 chr21  18.62639
chr22        21  51304566 chr22  19.11918
chr3         44 198022430  chr3  41.89330
chr4         34 191154276  chr4  40.82720
chr5         39 180915260  chr5  39.23786
chr6         23 171115067  chr6  37.71663
chr7         40 159138663  chr7  35.85761
chr8         39 146364022  chr8  33.87468
chr9         37 141213431  chr9  33.07518
chrX         89 155270560  chrX        NA
chrY          0  59373566  chrY  20.37168
library(lattice) xyplot(mirna.cnt + model.fit ~ chr.size,data=mirna.size.data, type=c("p","smooth"),auto.key=T, panel = function(x, y, ...) { panel.xyplot(x, y, ...) ltext(x, y-3, labels = mirna.size.data$chr, cex=0.7) }) dev.off() 
       
null device 
          1 
################################# Outliery przez MCMC ########### # Wczytywanie tabeli miRNA zapisanej wczesniej w innej sesji #save.image("data/miRNA2.Rdata") load("data/miRNA2.Rdata") ls() 
       
 [1] "cband"                "chr.sizes"            "f"              
"fit"                 
 [5] "fit2"                 "fit3"                 "initial"        
"logRfun"             
 [9] "m"                    "mirna"               
"mirna.chr.counts"     "mirna.size.data"     
[13] "mirna.size.data.mcmc" "obj"                  "query"          
"scale"               
[17] "session"              "st"                   "tnames"         
############## # Uzyj biblioteki mcmc by dopasowac model liniowy z jednoczesna estymacja outlierow library(mcmc) 
       
#Dokoncz pisanie funkcji logRfun, ktora liczy logLikelihood z modelu liniowego #uwzgledniajac prior.prob na liczbe outlierow z rozkladu Poissona logRfun<-function(data,poiss.lambda=1)function(a,b, ind ){ # Wektor indykacji wl/wyl; czy dana zmienna jest outlierem if (any(abs(ind)>2)) return(rep(-Inf,3)) ind <- ind > 0 # Zamieniamy na logical n <- sum(ind) num.out <- NROW(data)-n pois.prior <- dpois(num.out,lambda=poiss.lambda) loglik<-with(data[ind,] ,{ err2 <- ((a+chr.size*b) - mirna.cnt)^2 #wektor bledow kwadratowych sigma2 <- mean(err2) #Estymator wariancji bledu z danych #Uzupelnij kropki... # http://en.wikipedia.org/wiki/Normal_distribution loglik <- -sum(err2)/(2*sigma2) - n/2 * log(sigma2) -n/2 *log(2*pi) loglik }) c(loglik=loglik, pois.prior=pois.prior, score=loglik/n + pois.prior) } 
       
f<-logRfun(mirna.size.data,3) #Testujemy na wsp. z modelu f(coef(fit)[1], coef(fit)[2], rep(1,NROW(mirna.size.data))) logLik(fit) # Ta funkcja liczy logLik z dowolnego modelu lm, glm 
       
       loglik    pois.prior         score 
-106.80876118    0.04978707   -4.40057798 
'log Lik.' -106.8088 (df=3)
f(coef(fit)[1], coef(fit)[2], c(rep(1,NROW(mirna.size.data)-4),rep(0,4))) 
       
     loglik  pois.prior       score 
-87.3423989   0.1680314  -4.1990886 
# Unormujmy dane, by wspolczynniki mialy podobna skale mirna.size.data.mcmc<-transform(mirna.size.data, mirna.cnt.sd=sd(mirna.cnt), chr.size.sd=sd(chr.size)) mirna.size.data.mcmc$mirna.cnt<-with(mirna.size.data.mcmc,mirna.cnt/sd(mirna.cnt)) mirna.size.data.mcmc$chr.size<-with(mirna.size.data.mcmc,chr.size/sd(chr.size)) head(mirna.size.data.mcmc) 
       
      mirna.cnt chr.size   chr model.fit mirna.cnt.sd chr.size.sd
chr1  3.3164470 4.303860  chr1        NA     22.61456    57913269
chr10 1.5918945 2.340306 chr10  32.19372     22.61456    57913269
chr11 1.8129910 2.331185 chr11  32.11172     22.61456    57913269
chr12 1.4150174 2.311247 chr12  31.93250     22.61456    57913269
chr13 0.9286051 1.988661 chr13  29.03260     22.61456    57913269
chr14 3.0511312 1.853626 chr14        NA     22.61456    57913269
# Teraz obydwa wspolczynniki modelu sa rzedu ~ 1 fit3<-lm(mirna.cnt~chr.size,data=mirna.size.data.mcmc) fit3 
       
Call:
lm(formula = mirna.cnt ~ chr.size, data = mirna.size.data.mcmc)

Coefficients:
(Intercept)     chr.size  
     0.9475       0.3514  
 
       
# Uzyj mcmc by dopasowac model f<-logRfun(mirna.size.data.mcmc,1) obj<-function(v) f(v[1],v[2],v[-c(1,2)])[3] initial<- c(1,1,rep(1,NROW(mirna.size.data.mcmc))) scale<-initial*0.3 scale[1:2]<- 0.1 m<-metrop(obj, initial, nbatch=5e4, scale = scale) 
       
m$accept 
       
[1] 0.21824
rbind(apply(tail(m$batch,1e3),2,mean), apply(tail(m$batch,1e2),2,mean)) 
       
          [,1]     [,2]       [,3]       [,4]       [,5]      [,6]  
[,7]      [,8]      [,9]
[1,] 1.0902565 1.439027  1.0320760  0.7574030 0.06750177 1.0267118
-0.9044922 0.2741200 0.5401164
[2,] 0.2374091 1.945982 -0.2890466 -0.1195291 1.67434208 0.8347396
-0.8867536 0.5078999 1.7080499
        [,10]         [,11]       [,12]      [,13]     [,14]    
[,15]      [,16]      [,17]
[1,] 1.259615 -8.007389e-05  0.37745436  0.3952137 0.5572348
-1.153239  0.1922357 -0.3549457
[2,] 1.639002 -1.512139e+00 -0.01631375 -0.9182952 1.0329309 
0.445103 -0.1274226 -0.2893967
        [,18]      [,19]      [,20]     [,21]     [,22]     [,23]   
[,24]     [,25]      [,26]
[1,] 1.213400 -0.8050327 -0.9760944 0.3886073 0.1388701 0.7519106
0.9395086 0.3839869  0.9334442
[2,] 1.128681  0.4641364 -0.8211973 0.1924505 0.9506471 1.2134540
1.4565384 1.4050118 -0.8060151
acf(m$batch[1e4:3e4,1:2],lag.max=500) dev.off() 
       
null device 
          1 
# http://cran.r-project.org/web/packages/mcmc/index.html # MCMC example #http://cran.r-project.org/web/packages/mcmc/vignettes/demo.pdf ?metrop 
       
WARNING: Output truncated!  
full_output.txt



metrop                  package:mcmc                   R
Documentation

_M_e_t_r_o_p_o_l_i_s _A_l_g_o_r_i_t_h_m

_D_e_s_c_r_i_p_t_i_o_n:

     Markov chain Monte Carlo for continuous random vector using a
     Metropolis algorithm.

_U_s_a_g_e:

     metrop(obj, initial, nbatch, blen = 1, nspac = 1, scale = 1,
outfun,
         debug = FALSE, ...)
     
_A_r_g_u_m_e_n_t_s:

     obj: an R function that evaluates the log unnormalized
probability
          density of the desired equilibrium distribution of the
Markov
          chain.  First argument is the state vector of the Markov
          chain.  Other arguments arbitrary and taken from the ‘...’
          arguments of this function.  Should return ‘- Inf’ for
points
          of the state space having probability zero under the
desired
          equilibrium distribution.  Alternatively, an object of
class
          ‘"metropolis"’ from a previous run can be supplied, in
which
          case any missing arguments (including the log unnormalized
          density function) are taken from this object (up until
          version 0.7-2 this was incorrect with respect to the
‘debug’
          argument, now it applies to it too).

 initial: a real vector, the initial state of the Markov chain.

  nbatch: the number of batches.

    blen: the length of batches.

   nspac: the spacing of iterations that contribute to batches.

   scale: controls the proposal step size.  If scalar or vector, the
          proposal is ‘x + scale * z’ where ‘x’ is the current state
          and ‘z’ is a standard normal random vector.  If matrix,
the
          proposal is ‘x + scale %*% z’.

  outfun: controls the output.  If a function, then the batch means
of
          ‘outfun(state, ...)’ are returned.  If a numeric or
logical
          vector, then the batch means of ‘state[outfun]’ (if this
          makes sense).  If missing, the the batch means of ‘state’
are
          returned.

   debug: if ‘TRUE’ extra output useful for testing.

     ...: additional arguments for ‘obj’ or ‘outfun’.

_D_e_t_a_i_l_s:

     Runs a “random-walk” Metropolis algorithm with multivariate
normal
     proposal producing a Markov chain with equilibrium distribution
     having a specified unnormalized density.  Distribution must be
     continuous.  Support of the distribution is the support of the
     density specified by argument ‘obj’. The initial state must

...

     lud: the function used to calculate log unnormalized density,
          either ‘obj’ or ‘obj$lud’ from a previous run.

  nbatch: the argument ‘nbatch’ or ‘obj$nbatch’.

    blen: the argument ‘blen’ or ‘obj$blen’.

   nspac: the argument ‘nspac’ or ‘obj$nspac’.

  outfun: the argument ‘outfun’ or ‘obj$outfun’.
     Description of additional output when ‘debug = TRUE’ can be
found
     in the vignette ‘debug’ (<URL: ../doc/debug.pdf>).

_W_a_r_n_i_n_g:

     If ‘outfun’ is missing or not a function, then the log
     unnormalized density can be defined without a ... argument and
     that works fine. One can define it starting ‘ludfun <-
     function(state)’ and that works or ‘ludfun <-
function(state, foo,
     bar)’, where ‘foo’ and ‘bar’ are supplied as additional
arguments
     to ‘metrop’.

     If ‘outfun’ is a function, then both it and the log
unnormalized
     density function can be defined without ... arguments _if they
     have exactly the same arguments list_ and that works fine.
     Otherwise it doesn't work.  Start the definitions ‘ludfun <-
     function(state, foo)’ and ‘outfun <- function(state, bar)’
and you
     get an error about unused arguments.  Instead start the
     definitions ‘ludfun <- function(state, foo, ...)’ and
‘outfun <-
     function(state, bar, ...)’, supply ‘foo’ and ‘bar’ as
additional
     arguments to ‘metrop’, and that works fine.

     In short, the log unnormalized density function and ‘outfun’
need
     to have ... in their arguments list to be safe.  Sometimes it
     works when ... is left out and sometimes it doesn't.

     Of course, one can avoid this whole issue by always defining
the
     log unnormalized density function and ‘outfun’ to have only one
     argument ‘state’ and use global variables (objects in the R
global
     environment) to specify any other information these functions
need
     to use.  That too follows the R way.  But some people consider
     that bad programming practice.

_E_x_a_m_p_l_e_s:

     h <- function(x) if (all(x >= 0) && sum(x) <=
1) return(1) else return(-Inf)
     out <- metrop(h, rep(0, 5), 1000)
     out$accept
     # acceptance rate too low
     out <- metrop(out, scale = 0.1)
     out$accept
     # acceptance rate o. k. (about 25 percent)
     plot(out$batch[ , 1])
     # but run length too short (few excursions from end to end of
range)
     out <- metrop(out, nbatch = 1e4)
     out$accept
     plot(out$batch[ , 1])
     hist(out$batch[ , 1])