sadII_lab8_japonicus_genome

2387 days ago by macieksk

%html <a href="http://www.plantgdb.org/XGDB/phplib/download.php?GDB=Lj"> Genom Lotus Japonicus </a> 
%r download.file("http://www.plantgdb.org/download/Download/xGDB/LjGDB/LJcdna.bz2","data/LJcdna.bz2","wget") 
       
--2012-05-08 14:35:41-- 
http://www.plantgdb.org/download/Download/xGDB/LjGDB/LJcdna.bz2
Translacja www.plantgdb.org... 129.186.10.139
Łączenie się z www.plantgdb.org|129.186.10.139|:80... połączono.
Żądanie HTTP wysłano, oczekiwanie na odpowiedź... 200 OK
Długość: 120605 (118K) [application/x-bzip2]
Zapis do: `data/LJcdna.bz2'

 0% [                                       ] 0           --.-K/s   
13% [====>                                  ] 16.384      56,9K/s
54% [====================>                  ] 65.536       114K/s
100%[======================================>] 120.605      167K/s
w 0,7s     

2012-05-08 14:35:43 (167 KB/s) - zapisano `data/LJcdna.bz2'
[120605/120605]
%time system("bunzip2 data/LJcdna.bz2") 
       
bunzip2: Output file data/LJcdna already exists.
CPU time: 0.00 s,  Wall time: 0.08 s
dir("data") 
       
[1] "jap.nucl.Rdata"     "jap.nucl.RdataTmp"  "LJcdna"            
"LJcdna.base"       
[5] "LJcdna.bz2"         "LJcdna.nucl"        "LJcds"             
"LJcds.line.num.tab"
[9] "LJcds.trna"        
system("head data/LJcdna") system("wc -l data/LJcdna") 
       
>gi|82409050|gb|DQ249171.1|DQ249171 PLN Lotus japonicus ubiquitin
mRNA, complete cds.
GAACCCTAACAATTCTTGTTCATATCGCTTCTCTCTACTTTCAAGATGCAGATCTTCGTG
AAGACCCTCACCGGAAAGACCATCACTCTCGAGGTCGAGAGCTCCGACACCATCGACAAC
GTTAAGGCAAAGATCCAGGATAAGGAGGGGATTCCCCCGGACCAGCAGAGGTTGATCTTC
GCCGGAAAGCAGCTTGAGGATGGTCGTACCCTCGCCGACTACAACATCCAGAAGGAGTCA
ACCCTCCACCTTGTCCTCCGTCTCAGGGGAGGCATGCAGATCTTCGTGAAGACCCTCACT
GGCAAGACCATCACCCTCGAGGTCGAAAGCTCTGACACTATTGACAACGTCAAGGCTAAG
ATCCAGGACAAGGAGGGAATTCCCCCGGATCAGCAGAGGTTGATCTTCGCCGGCAAGCAG
CTCGAGGATGGTCGCACCTTGGCCGACTACAACATCCAGAAGGAGTCTACCCTCCACCTT
GTCCTCCGTCTCCGTGGTGGTATGCAGATCTTCGTCAAGACCTTGACTGGCAAGACCATC
7122 data/LJcdna
system("grep -v -e '^>' data/LJcdna > data/LJcdna.nucl") 
       
system("wc -l data/LJcdna.nucl") 
       
6862 data/LJcdna.nucl
nucl<-scan("data/LJcdna.nucl",what=character()) 
       
Read 6862 items
nucl.data<-unlist(strsplit(nucl,"")) 
       
table(nucl.data) 
       
nucl.data
     A      C      G      H      N      R      T      W 
115036  81988  93811      1     13      1 113095      7 
# Tylko ACGT nucl.data<-nucl.data[nucl.data%in%c("A","C","G","T")] 
       
table(nucl.data) 
       
nucl.data
     A      C      G      T 
115036  81988  93811 113095 
dbl.nucl<- paste( head(nucl.data,-1), tail(nucl.data,-1), sep="") table(dbl.nucl) 
       
dbl.nucl
   AA    AC    AG    AT    CA    CC    CG    CT    GA    GC    GG   
GT    TA    TC    TG    TT 
37153 19842 27972 30069 28877 17535  9320 26256 29646 19799 22944
21422 19360 24812 33574 35348 
table(dbl.nucl)["AA"] 
       
   AA 
37153 
# Wykonaj "Chi^2 contingency" test (?chisq.test) # oraz "Fisher exact test" (?fisher.test) # na niezaleznosc wystapienia nukleotydu, w zaleznosci od poprzedniego 
       
ntbl<-table(dbl.nucl) ntbl twod.tbl<-as.table(matrix(ntbl,byrow=TRUE,nrow=4)) colnames(twod.tbl)<-c("A","C","G","T") rownames(twod.tbl)<-c("A","C","G","T") twod.tbl 
       
dbl.nucl
   AA    AC    AG    AT    CA    CC    CG    CT    GA    GC    GG   
GT    TA    TC    TG    TT 
37153 19842 27972 30069 28877 17535  9320 26256 29646 19799 22944
21422 19360 24812 33574 35348 



      A     C     G     T
A 37153 19842 27972 30069
C 28877 17535  9320 26256
G 29646 19799 22944 21422
T 19360 24812 33574 35348
fisher.test(twod.tbl,hybrid=TRUE,simulate.p.value=TRUE) 
       
^CInterrupting R Interpreter...
Traceback (click to the left of this block for traceback)
...
__SAGE__
nucl.data2<-unlist(strsplit(nucl,"")) # Tylko ACGT, NA przeciwnie nucl.data2[!nucl.data2%in%c("A","C","G","T")]<-NA summary(factor(nucl.data2)) 
       
     A      C      G      T   NA's 
115036  81988  93811 113095     22 
nucl.df <- data.frame(akt=head(nucl.data2,-2), nast=head(tail(nucl.data2,-1),-1),nnast= tail(nucl.data2,-2)) head(nucl.df) 
       
  akt nast nnast
1   G    A     A
2   A    A     C
3   A    C     C
4   C    C     C
5   C    C     T
6   C    T     A
nucl.df<-transform(nucl.df,akt.pir=akt%in%c("T","C"),nast.pir=nast%in%c("T","C"),nnast.pir=nnast%in%c("T","C")) head(nucl.df) 
       
  akt nast nnast akt.pir nast.pir nnast.pir
1   G    A     A   FALSE    FALSE     FALSE
2   A    A     C   FALSE    FALSE      TRUE
3   A    C     C   FALSE     TRUE      TRUE
4   C    C     C    TRUE     TRUE      TRUE
5   C    C     T    TRUE     TRUE      TRUE
6   C    T     A    TRUE     TRUE     FALSE
nucl.df<-transform(nucl.df,nnnast=factor(c(as.character(tail(nucl.df$nnast,-1)),"A"))) nucl.df<-transform(nucl.df, nnnast.pir=nnnast%in%c("T","C") ) head(nucl.df) 
       
  akt nast nnast akt.pir nast.pir nnast.pir nnnast nnnast.pir
1   G    A     A   FALSE    FALSE     FALSE      C       TRUE
2   A    A     C   FALSE    FALSE      TRUE      C       TRUE
3   A    C     C   FALSE     TRUE      TRUE      C       TRUE
4   C    C     C    TRUE     TRUE      TRUE      T       TRUE
5   C    C     T    TRUE     TRUE      TRUE      A      FALSE
6   C    T     A    TRUE     TRUE     FALSE      A      FALSE
nucl.df<-head(nucl.df,5e4) 
       
fit1 <- lm(akt.pir ~ nast.pir,data=nucl.df) fit2 <- lm(akt.pir ~ nast.pir+nnast.pir,data=nucl.df) fit3 <- lm(akt.pir ~ nast,data=nucl.df) fit4 <- lm(akt.pir ~ nast+nnast.pir,data=nucl.df) fit5 <- lm(akt.pir ~ nast+nnast,data=nucl.df) 
       
?stepAIC 
       
WARNING: Output truncated!  
full_output.txt



stepAIC                  package:MASS                  R
Documentation

_C_h_o_o_s_e _a _m_o_d_e_l _b_y _A_I_C _i_n _a
_S_t_e_p_w_i_s_e _A_l_g_o_r_i_t_h_m

_D_e_s_c_r_i_p_t_i_o_n:

     Performs stepwise model selection by AIC.

_U_s_a_g_e:

     stepAIC(object, scope, scale = 0,
             direction = c("both", "backward", "forward"),
             trace = 1, keep = NULL, steps = 1000, use.start =
FALSE,
             k = 2, ...)
     
_A_r_g_u_m_e_n_t_s:

  object: an object representing a model of an appropriate class. 
This
          is used as the initial model in the stepwise search.

   scope: defines the range of models examined in the stepwise
search.
          This should be either a single formula, or a list
containing
          components ‘upper’ and ‘lower’, both formulae.  See the
          details for how to specify the formulae and how they are
          used.

   scale: used in the definition of the AIC statistic for selecting
the
          models, currently only for ‘lm’ and ‘aov’ models (see
          ‘extractAIC’ for details).

direction: the mode of stepwise search, can be one of ‘"both"’,
          ‘"backward"’, or ‘"forward"’, with a default of ‘"both"’. 
If
          the ‘scope’ argument is missing the default for
‘direction’
          is ‘"backward"’.

   trace: if positive, information is printed during the running of
          ‘stepAIC’.  Larger values may give more information on the
          fitting process.

    keep: a filter function whose input is a fitted model object and
          the associated ‘AIC’ statistic, and whose output is
          arbitrary.  Typically ‘keep’ will select a subset of the
          components of the object and return them. The default is
not
          to keep anything.

   steps: the maximum number of steps to be considered.  The default
is
          1000 (essentially as many as required).  It is typically
used
          to stop the process early.

use.start: if true the updated fits are done starting at the linear
          predictor for the currently selected model. This may speed
up
          the iterative calculations for ‘glm’ (and other fits), but
it
          can also slow them down. *Not used* in R.

       k: the multiple of the number of degrees of freedom used for
the
          penalty.  Only ‘k = 2’ gives the genuine AIC: ‘k = log(n)’
is
          sometimes referred to as BIC or SBC.

     ...: any additional arguments to ‘extractAIC’. (None are
currently

...


     the stepwise-selected model is returned, with up to two
additional
     components.  There is an ‘"anova"’ component corresponding to
the
     steps taken in the search, as well as a ‘"keep"’ component if
the
     ‘keep=’ argument was supplied in the call. The ‘"Resid. Dev"’
     column of the analysis of deviance table refers to a constant
     minus twice the maximized log likelihood: it will be a deviance
     only in cases where a saturated model is well-defined (thus
     excluding ‘lm’, ‘aov’ and ‘survreg’ fits, for example).

_N_o_t_e:

     The model fitting must apply the models to the same dataset. 
This
     may be a problem if there are missing values and an ‘na.action’
     other than ‘na.fail’ is used (as is the default in R).  We
suggest
     you remove the missing values first.

_R_e_f_e_r_e_n_c_e_s:

     Venables, W. N. and Ripley, B. D. (2002) _Modern Applied
     Statistics with S._ Fourth edition.  Springer.

_S_e_e _A_l_s_o:

     ‘addterm’, ‘dropterm’, ‘step’

_E_x_a_m_p_l_e_s:

     quine.hi <- aov(log(Days + 2.5) ~ .^4, quine)
     quine.nxt <- update(quine.hi, . ~ . - Eth:Sex:Age:Lrn)
     quine.stp <- stepAIC(quine.nxt,
         scope = list(upper = ~Eth*Sex*Age*Lrn, lower = ~1),
         trace = FALSE)
     quine.stp$anova
     
     cpus1 <- cpus
     attach(cpus)
     for(v in names(cpus)[2:7])
       cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])),
                         include.lowest = TRUE)
     detach()
     cpus0 <- cpus1[, 2:8]  # excludes names, authors'
predictions
     cpus.samp <- sample(1:209, 100)
     cpus.lm <- lm(log10(perf) ~ ., data = cpus1[cpus.samp,2:8])
     cpus.lm2 <- stepAIC(cpus.lm, trace = FALSE)
     cpus.lm2$anova
     
     example(birthwt)
     birthwt.glm <- glm(low ~ ., family = binomial, data = bwt)
     birthwt.step <- stepAIC(birthwt.glm, trace = FALSE)
     birthwt.step$anova
     birthwt.step2 <- stepAIC(birthwt.glm, ~ .^2 +
I(scale(age)^2)
         + I(scale(lwt)^2), trace = FALSE)
     birthwt.step2$anova
     
     quine.nb <- glm.nb(Days ~ .^4, data = quine)
     quine.nb2 <- stepAIC(quine.nb)
     quine.nb2$anova
     
library(MASS) 
       
sa<-stepAIC(fit1,scope=list(upper=akt.pir~nast+nnast+nast.pir,lower=akt.pir.~1),direction="both") 
       
Start:  AIC=-69810.57
akt.pir ~ nast.pir

           Df Sum of Sq   RSS    AIC
+ nast      2    20.582 12355 -69890
+ nnast     3    13.236 12362 -69858
<none>                  12376 -69811
- nast.pir  1   117.636 12493 -69340

Step:  AIC=-69889.8
akt.pir ~ nast.pir + nast


Step:  AIC=-69889.8
akt.pir ~ nast

        Df Sum of Sq   RSS    AIC
+ nnast  3    11.915 12343 -69932
<none>               12355 -69890
- nast   3   138.218 12493 -69340

Step:  AIC=-69932.04
akt.pir ~ nast + nnast

        Df Sum of Sq   RSS    AIC
<none>               12343 -69932
- nnast  3    11.915 12355 -69890
- nast   3   130.024 12473 -69414
summary(sa) 
       
Call:
lm(formula = akt.pir ~ nast + nnast, data = nucl.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5691 -0.4971 -0.3987  0.4892  0.6013 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.398710   0.005471  72.875  < 2e-16 ***
nastC       0.112070   0.006470  17.320  < 2e-16 ***
nastG       0.053528   0.006231   8.591  < 2e-16 ***
nastT       0.125480   0.006038  20.782  < 2e-16 ***
nnastC      0.044901   0.006486   6.923 4.49e-12 ***
nnastG      0.018315   0.006310   2.903  0.00370 ** 
nnastT      0.016829   0.006033   2.789  0.00528 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

Residual standard error: 0.4969 on 49993 degrees of freedom
Multiple R-squared: 0.01202,	Adjusted R-squared: 0.0119 
F-statistic: 101.3 on 6 and 49993 DF,  p-value: < 2.2e-16 
models<-list(fit1,fit2,fit3,fit4,fit5) 
       
lapply(models,summary) 
       
Error in lapply(models, summary) : object 'models' not found
cbind(AIC=sapply(models,BIC),BIC=sapply(models,BIC)) 
       
Error in lapply(X = X, FUN = FUN, ...) : object 'models' not found
%time save(nucl.df,models,file="jap.nucl.Rdata") 
       
CPU time: 0.00 s,  Wall time: 3.43 s
png(width=800,height=800) layout(matrix(1:4,nrow=2,ncol=2)) plot(fit1) dev.off() 
       
^CInterrupting R Interpreter...

Traceback (click to the left of this block for traceback)
...
__SAGE__
test.data<-head(nucl.df,1000) prd<-predict(fit5,test.data) head(prd) summary(prd) 
       
        1         2         3         4         5         6 
0.4027101 0.4487939 0.5439978 0.5439978 0.5215937 0.5241723 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.4027  0.4396  0.4857  0.4847  0.5343  0.5703 
#Uzupelnij ... aby powstala krzywa ROC dla predykcji z modelu liniowego library(ROCR) pred<-prediction( predict(fit5,nucl.df),nucl.df$akt.pir ) perf<-performance(pred , measure = "tpr", x.measure = "fpr") plot( perf,colorize=TRUE) dev.off() 
       
null device 
          1 
fit5.g <- glm(akt.pir ~ nast+nnast,data=nucl.df,family="binomial") 
       
summary(na.exclude(nucl.df)) 
       
 akt        nast       nnast       akt.pir         nast.pir      
nnast.pir       nnnast    
 A:115018   A:115017   A:115017   Mode :logical   Mode :logical  
Mode :logical   A:115013  
 C: 81982   C: 81981   C: 81982   FALSE:208814    FALSE:208817   
FALSE:208813    C: 81981  
 G: 93796   G: 93800   G: 93796   TRUE :195060    TRUE :195057   
TRUE :195061    G: 93803  
 T:113078   T:113076   T:113079   NA's :0         NA's :0        
NA's :0         T:113077  
 nnnast.pir     
 Mode :logical  
 FALSE:208816   
 TRUE :195058   
 NA's :0        
%time fit6 <- glm(akt.pir ~ nast*nnast*nnnast.pir, data=na.exclude(nucl.df),family="binomial") fit7 <- glm(akt.pir ~ nast*nnast*nnnast, data=na.exclude(nucl.df),family="binomial") 
       
CPU time: 0.00 s,  Wall time: 65.71 s
# Narysuj dwie krzywe ROC na jednym wykresie - jedna z modelu liniowego fit1 i druga z fit7 pred<-prediction( predict(fit1,nucl.df),nucl.df$akt.pir ) perf<-performance(pred , measure = "tpr", x.measure = "fpr") plot( perf,colorize=TRUE) pred<-prediction( predict(fit7,nucl.df),nucl.df$akt.pir ) perf<-performance(pred , measure = "tpr", x.measure = "fpr") plot( perf,colorize=TRUE,add=TRUE) dev.off() 
       
null device 
          1 
summary(fit6) summary(fit7) 
       
summary(aov(fit7)) 
       
                      Df Sum Sq Mean Sq F value Pr(>F)    
nast                   3   1051   350.4 1429.96 <2e-16 ***
nnast                  3    110    36.8  150.12 <2e-16 ***
nnnast                 3    293    97.8  399.11 <2e-16 ***
nast:nnast             9    316    35.1  143.44 <2e-16 ***
nast:nnnast            9     33     3.7   15.04 <2e-16 ***
nnast:nnnast           9     28     3.1   12.63 <2e-16 ***
nast:nnast:nnnast     27     70     2.6   10.61 <2e-16 ***
Residuals         403810  98949     0.2                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 
AIC(fit6) AIC(fit7) 
       
[1] 552228.3
[1] 551854.3
anova(fit6,fit7,test="Chi") 
       
Analysis of Deviance Table

Model 1: akt.pir ~ nast * nnast * nnnast.pir
Model 2: akt.pir ~ nast * nnast * nnnast
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1    403842     552164                          
2    403810     551726 32   438.04 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 
summary(aov(fit6)) 
       
                          Df Sum Sq Mean Sq F value  Pr(>F)    
nast                       3   1051   350.4 1428.53 < 2e-16 ***
nnast                      3    110    36.8  149.97 < 2e-16 ***
nnnast.pir                 1    264   263.7 1075.14 < 2e-16 ***
nast:nnast                 9    318    35.3  143.99 < 2e-16 ***
nast:nnnast.pir            3     19     6.5   26.34 < 2e-16 ***
nnast:nnnast.pir           3      8     2.8   11.40 1.8e-07 ***
nast:nnast:nnnast.pir      9     25     2.7   11.17 < 2e-16 ***
Residuals             403842  99056     0.2                    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 
save.image("data/jap.nucl.Rdata") 
       
 
       
razem 112M
drwxrwxrwx 2 nbuser nbuser 4,0K 2012-04-24 12:27 .
drwxr-xr-x 5 nbuser nbuser 4,0K 2012-04-03 12:57 ..
-rw-r--r-- 1 nbuser nbuser  81M 2012-04-24 12:28 jap.nucl.RdataTmp
-rw-r--r-- 1 nbuser nbuser 430K 2010-11-08 17:03 LJcdna
-rw-r--r-- 1 nbuser nbuser 402K 2012-03-13 14:08 LJcdna.base
-rw-r--r-- 1 nbuser nbuser 118K 2010-11-08 17:03 LJcdna.bz2
-rw-r--r-- 1 nbuser nbuser 402K 2012-04-24 12:08 LJcdna.nucl
-rw-r--r-- 1 nbuser nbuser  31M 2010-11-08 17:03 LJcds
-rw-r--r-- 1 nbuser nbuser 4,0K 2012-03-30 16:35 LJcds.line.num.tab
-rw-r--r-- 1 nbuser nbuser  46K 2012-03-30 16:58 LJcds.trna
# Porownaj model zbudowany na cdna z modelem zbudowanym na cds 
       
%r #download.file("http://www.plantgdb.org/download/Download/xGDB/LjGDB/LJcds.bz2","data/LJcds.bz2","wget") system("bunzip2 data/LJcds.bz2") 
       
system("head data/LJcds") system("wc -l data/LJcds") 
       
>chr1.CM0001.10.nd + phase: 0 /partial
ATGGCTGAAAGCTCTTCAGCTTCTGCTGCGACTCCATCGAAGCCCGTTGTGGTTAGGGTC
AAACGAAAACCCTCTCAATCTCCACTTGATGCTTTCTGGTTGGAAATCAATGAGAGGCCA
TTGAAACGCTCTCTTCTGGATTTTGGAAATCTGTCAATTTCTGATTCTGCTCAGAAGG
>chr1.CM0001.20.nd + phase: 2 /pseudo/partial
GAATTTGGTTTTGCCCATCTATTCCAAGGAGGAGACTTTTAAGAATCTGGAACCACTATC
CGTCCTTAACTCTCGTGGCAGCAGCGAACACGTTCTGAAGTTCCTTCGTTACCTTGACCA
GTTCGGCGAAACCAAGGTTCACTTTGAAACATGTGCTTACTTTGGTGACTCTTATCCACC
TGGTTCGTGCCCTTGTCGTCTGAAGCAAATTTGGGATAACATACCTGTGCATAGACAAGG
AAGCAAAGCAGGAGCTGCTGTGCAAGTCCATCTCAAACAGAAGCTAAGGTCACCCCAGCG
556187 data/LJcds
system('cat data/LJcds | grep -e "^>" | wc -l') system('cat data/LJcds | grep -n -e "^>" | grep "tRNA" | wc -l ') 
       
43051
652
system('cat data/LJcds | grep -n -e "^>" -A 1 | grep "tRNA" -A 1 | head ') 
       
31:>chr1.CM0001.100.nd + phase: 0 /tRNA
32-GGGTCGTTAGCTCAGTCGGTAGAGCAGTTGACTCTTAATCAATTGGTCACAGGTTCGAAC
--
2857:>chr4.CM0007.1020.nc - phase: 0 /tRNA
2858-GGGCATTTGGTCTAGTGGTATGATTCTCGCTTAGGGTGCGAGAGGTCCCGAGTTCAATTC
--
3825:>chr4.CM0007.640.nd + phase: 0 /tRNA
3826-GCGGGGATAGCTCAGTTGGGAGAGCGTCAGACTGAAGATCTGAAGGTCGCGTGTTCGATC
--
3828:>chr4.CM0007.650.nd + phase: 0 /tRNA
system("cat data/LJcds | awk '$0~\"^>\" && f {f=0;next} $0~\"tRNA\"{ f=1;next}f ' > data/LJcds.trna") 
       
trna<-scan("data/LJcds.trna",what=character()) 
       
Read 1208 items
trna<-factor(unlist(strsplit(trna,""))) 
       
summary(trna) 
       
    a     A     c     C     g     G     t     T 
    7  9297    10 11011    10 13902    10 11060 
levels(trna) levels(trna)<-c("A","A","C","C","G","G","T","T") levels(trna) 
       
[1] "a" "A" "c" "C" "g" "G" "t" "T"

[1] "A" "C" "G" "T"
summary(trna) 
       
    A     C     G     T 
 9304 11021 13912 11070