sadII_lab5_japonicus_genome

2303 days ago by macieksk

%html <a href="http://www.plantgdb.org/XGDB/phplib/download.php?GDB=Lj"> Genom Lotus Japonicus </a> 
%r download.file("http://www.plantgdb.org/download/Download/xGDB/LjGDB/LJcdna.bz2","data/LJcdna.bz2","wget") 
       
--2012-03-27 16:09:23-- 
http://www.plantgdb.org/download/Download/xGDB/LjGDB/LJcdna.bz2
Resolving www.plantgdb.org (www.plantgdb.org)... 129.186.10.139
Connecting to www.plantgdb.org
(www.plantgdb.org)|129.186.10.139|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 120605 (118K) [application/x-bzip2]
Saving to: `data/LJcdna.bz2'

 0% [                                       ] 0           --.-K/s   
100%[======================================>] 120,605     --.-K/s
in 0.1s    

2012-03-27 16:09:23 (970 KB/s) - `data/LJcdna.bz2' saved
[120605/120605]
%time system("bunzip2 data/LJcdna.bz2") 
       
bunzip2: Output file data/LJcdna already exists.
CPU time: 0.00 s,  Wall time: 0.05 s
dir("data") 
       
[1] "LJcdna"      "LJcdna.base" "LJcdna.bz2"  "LJcdna.nucl"
system("head data/LJcdna") system("wc -l data/LJcdna") 
       
>gi|82409050|gb|DQ249171.1|DQ249171 PLN Lotus japonicus ubiquitin
mRNA, complete cds.
GAACCCTAACAATTCTTGTTCATATCGCTTCTCTCTACTTTCAAGATGCAGATCTTCGTG
AAGACCCTCACCGGAAAGACCATCACTCTCGAGGTCGAGAGCTCCGACACCATCGACAAC
GTTAAGGCAAAGATCCAGGATAAGGAGGGGATTCCCCCGGACCAGCAGAGGTTGATCTTC
GCCGGAAAGCAGCTTGAGGATGGTCGTACCCTCGCCGACTACAACATCCAGAAGGAGTCA
ACCCTCCACCTTGTCCTCCGTCTCAGGGGAGGCATGCAGATCTTCGTGAAGACCCTCACT
GGCAAGACCATCACCCTCGAGGTCGAAAGCTCTGACACTATTGACAACGTCAAGGCTAAG
ATCCAGGACAAGGAGGGAATTCCCCCGGATCAGCAGAGGTTGATCTTCGCCGGCAAGCAG
CTCGAGGATGGTCGCACCTTGGCCGACTACAACATCCAGAAGGAGTCTACCCTCCACCTT
GTCCTCCGTCTCCGTGGTGGTATGCAGATCTTCGTCAAGACCTTGACTGGCAAGACCATC
7122 data/LJcdna
system("grep -v -e '^>' data/LJcdna > data/LJcdna.nucl") 
       
system("wc -l data/LJcdna.nucl") 
       
6862 data/LJcdna.nucl
nucl<-scan("data/LJcdna.nucl",what=character()) 
       
Read 6862 items
nucl.data<-unlist(strsplit(nucl,"")) 
       
table(nucl.data) 
       
nucl.data
     A      C      G      H      N      R      T      W 
115036  81988  93811      1     13      1 113095      7 
# Tylko ACGT nucl.data<-nucl.data[nucl.data%in%c("A","C","G","T")] 
       
table(nucl.data) 
       
nucl.data
     A      C      G      T 
115036  81988  93811 113095 
dbl.nucl<- paste( head(nucl.data,-1), tail(nucl.data,-1), sep="") table(dbl.nucl) 
       
dbl.nucl
   AA    AC    AG    AT    CA    CC    CG    CT    GA    GC    GG   
GT    TA    TC    TG    TT 
37153 19842 27972 30069 28877 17535  9320 26256 29646 19799 22944
21422 19360 24812 33574 35348 
table(dbl.nucl)["AA"] 
       
   AA 
37153 
# Wykonaj "Chi^2 contingency" test (?chisq.test) # oraz "Fisher exact test" (?fisher.test) # na niezaleznosc wystapienia nukleotydu, w zaleznosci od poprzedniego 
       
ntbl<-table(dbl.nucl) ntbl twod.tbl<-as.table(matrix(ntbl,byrow=TRUE,nrow=4)) colnames(twod.tbl)<-c("A","C","G","T") rownames(twod.tbl)<-c("A","C","G","T") twod.tbl 
       
dbl.nucl
   AA    AC    AG    AT    CA    CC    CG    CT    GA    GC    GG   
GT    TA    TC    TG    TT 
37153 19842 27972 30069 28877 17535  9320 26256 29646 19799 22944
21422 19360 24812 33574 35348 



      A     C     G     T
A 37153 19842 27972 30069
C 28877 17535  9320 26256
G 29646 19799 22944 21422
T 19360 24812 33574 35348
fisher.test(twod.tbl,hybrid=TRUE,simulate.p.value=TRUE) 
       
	Fisher's Exact Test for Count Data with simulated p-value (based on
2000 replicates)

data:  twod.tbl 
p-value = 0.0004998
alternative hypothesis: two.sided 
nucl.data2<-unlist(strsplit(nucl,"")) # Tylko ACGT, NA przeciwnie nucl.data2[!nucl.data2%in%c("A","C","G","T")]<-NA summary(factor(nucl.data2)) 
       
     A      C      G      T   NA's 
115036  81988  93811 113095     22 
nucl.df <- data.frame(akt=head(nucl.data2,-2), nast=head(tail(nucl.data2,-1),-1),nnast= tail(nucl.data2,-2)) head(nucl.df) 
       
  akt nast nnast
1   G    A     A
2   A    A     C
3   A    C     C
4   C    C     C
5   C    C     T
6   C    T     A
nucl.df<-transform(nucl.df,akt.pir=akt%in%c("T","C"),nast.pir=nast%in%c("T","C"),nnast.pir=nnast%in%c("T","C")) head(nucl.df) 
       
  akt nast nnast akt.pir nast.pir nnast.pir
1   G    A     A   FALSE    FALSE     FALSE
2   A    A     C   FALSE    FALSE      TRUE
3   A    C     C   FALSE     TRUE      TRUE
4   C    C     C    TRUE     TRUE      TRUE
5   C    C     T    TRUE     TRUE      TRUE
6   C    T     A    TRUE     TRUE     FALSE
fit1 <- lm(akt.pir ~ nast.pir,data=nucl.df) fit2 <- lm(akt.pir ~ nast.pir+nnast.pir,data=nucl.df) fit3 <- lm(akt.pir ~ nast,data=nucl.df) fit4 <- lm(akt.pir ~ nast+nnast.pir,data=nucl.df) fit5 <- lm(akt.pir ~ nast+nnast,data=nucl.df) 
       
models<-list(fit1,fit2,fit3,fit4,fit5) 
       
lapply(models,summary) 
       
[[1]]

Call:
lm(formula = akt.pir ~ nast.pir, data = nucl.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5329 -0.4363 -0.4363  0.4672  0.5637 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.436314   0.001088  400.91   <2e-16 ***
nast.pirTRUE 0.096534   0.001566   61.64   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.4974 on 403948 degrees of freedom
Multiple R-squared: 0.009319,	Adjusted R-squared: 0.009316 
F-statistic:  3800 on 1 and 403948 DF,  p-value: < 2.2e-16 


[[2]]

Call:
lm(formula = akt.pir ~ nast.pir + nnast.pir, data = nucl.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5462 -0.4525 -0.4238  0.4824  0.5762 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.423827   0.001286  329.52   <2e-16 ***
nast.pirTRUE  0.093771   0.001573   59.62   <2e-16 ***
nnast.pirTRUE 0.028619   0.001573   18.20   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.4972 on 403947 degrees of freedom
Multiple R-squared: 0.01013,	Adjusted R-squared: 0.01013 
F-statistic:  2067 on 2 and 403947 DF,  p-value: < 2.2e-16 


[[3]]

Call:
lm(formula = akt.pir ~ nast, data = nucl.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5447 -0.4572 -0.4193  0.4835  0.5807 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.419304   0.001466  286.09   <2e-16 ***
nastC       0.097199   0.002272   42.78   <2e-16 ***
nastG       0.037940   0.002187   17.35   <2e-16 ***
nastT       0.125394   0.002082   60.24   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.4971 on 403924 degrees of freedom
  (22 observations deleted due to missingness)
Multiple R-squared: 0.01042,	Adjusted R-squared: 0.01042 
F-statistic:  1418 on 3 and 403924 DF,  p-value: < 2.2e-16 


[[4]]

Call:
lm(formula = akt.pir ~ nast + nnast.pir, data = nucl.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5581 -0.4733 -0.4069  0.4988  0.5931 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.406915   0.001616  251.81   <2e-16 ***
nastC         0.094335   0.002277   41.44   <2e-16 ***
nastG         0.037783   0.002186   17.28   <2e-16 ***
nastT         0.122592   0.002086   58.76   <2e-16 ***
nnast.pirTRUE 0.028556   0.001572   18.17   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.4969 on 403923 degrees of freedom
  (22 observations deleted due to missingness)
Multiple R-squared: 0.01123,	Adjusted R-squared: 0.01122 
F-statistic:  1147 on 4 and 403923 DF,  p-value: < 2.2e-16 


[[5]]

Call:
lm(formula = akt.pir ~ nast + nnast, data = nucl.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5703 -0.4857 -0.4027  0.5021  0.5973 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.402710   0.001887 213.405  < 2e-16 ***
nastC       0.095204   0.002284  41.684  < 2e-16 ***
nastG       0.036925   0.002187  16.881  < 2e-16 ***
nastT       0.121462   0.002098  57.906  < 2e-16 ***
nnastC      0.046084   0.002284  20.177  < 2e-16 ***
nnastG      0.010109   0.002221   4.552 5.32e-06 ***
nnastT      0.023680   0.002097  11.294  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 0.4968 on 403902 degrees of freedom
  (41 observations deleted due to missingness)
Multiple R-squared: 0.01152,	Adjusted R-squared: 0.0115 
F-statistic: 784.4 on 6 and 403902 DF,  p-value: < 2.2e-16 
cbind(AIC=sapply(models,BIC),BIC=sapply(models,BIC)) 
       
          AIC      BIC
[1,] 582152.7 582152.7
[2,] 581834.6 581834.6
[3,] 581696.7 581696.7
[4,] 581379.7 581379.7
[5,] 581261.6 581261.6
plot(fit4) dev.off() 
       
There were 50 or more warnings (use warnings() to see the first 50)
null device 
          1 



test.data<-head(nucl.df,1000) prd<-predict(fit5,test.data) head(prd) summary(prd) 
       
        1         2         3         4         5         6 
0.4027101 0.4487939 0.5439978 0.5439978 0.5215937 0.5241723 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.4027  0.4396  0.4857  0.4847  0.5343  0.5703 
#Uzupelnij ... aby powstala krzywa ROC dla predykcji z modelu liniowego library(ROCR) pred<-prediction( ... ) perf<-performance(pred , measure = "tpr", x.measure = "fpr") plot( perf,colorize=TRUE) dev.off()