<?xml version='1.0'?>
<!DOCTYPE art SYSTEM 'http://www.biomedcentral.com/xml/article.dtd'>
<art>
   <ui>gb-2008-9-12-r175</ui>
   <ji>GBJ</ji>
   <fm>
      <dochead>Method</dochead>
      <bibl>
         <title>
            <p>Annotating genomes with massive-scale RNA sequencing</p>
         </title>
         <aug>
            <au id="A1" ce="yes">
               <snm>Denoeud</snm>
               <fnm>France</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>fdenoeud@genoscope.cns.fr</email>
            </au>
            <au id="A2" ca="yes" ce="yes">
               <snm>Aury</snm>
               <fnm>Jean-Marc</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>jmaury@genoscope.cns.fr</email>
            </au>
            <au id="A3">
               <snm>Da Silva</snm>
               <fnm>Corinne</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>dasilva@genoscope.cns.fr</email>
            </au>
            <au id="A4">
               <snm>Noel</snm>
               <fnm>Benjamin</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>bnoel@genoscope.cns.fr</email>
            </au>
            <au id="A5">
               <snm>Rogier</snm>
               <fnm>Odile</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>orogier@genoscope.cns.fr</email>
            </au>
            <au id="A6">
               <snm>Delledonne</snm>
               <fnm>Massimo</fnm>
               <insr iid="I4"/>
               <email>massimo.delledonne@univr.it</email>
            </au>
            <au id="A7">
               <snm>Morgante</snm>
               <fnm>Michele</fnm>
               <insr iid="I5"/>
               <email>michele.morgante@uniud.it</email>
            </au>
            <au id="A8">
               <snm>Valle</snm>
               <fnm>Giorgio</fnm>
               <insr iid="I6"/>
               <email>giorgio.valle@unipd.it</email>
            </au>
            <au id="A9">
               <snm>Wincker</snm>
               <fnm>Patrick</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>pwincker@genoscope.cns.fr</email>
            </au>
            <au id="A10">
               <snm>Scarpelli</snm>
               <fnm>Claude</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>claude@genoscope.cns.fr</email>
            </au>
            <au id="A11">
               <snm>Jaillon</snm>
               <fnm>Olivier</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>ojaillon@genoscope.cns.fr</email>
            </au>
            <au id="A12">
               <snm>Artiguenave</snm>
               <fnm>Fran&#231;ois</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <insr iid="I3"/>
               <email>artigue@genoscope.cns.fr</email>
            </au>
         </aug>
         <insg>
            <ins id="I1">
               <p>CEA, DSV, Institut de G&#233;nomique, Genoscope, 2 rue Gaston Cr&#233;mieux, CP5706, 91057 Evry, France</p>
            </ins>
            <ins id="I2">
               <p>CNRS, UMR 8030, 2 rue Gaston Cr&#233;mieux, CP5706, 91057 Evry, France</p>
            </ins>
            <ins id="I3">
               <p>Universit&#233; d'Evry, 91057 Evry, France</p>
            </ins>
            <ins id="I4">
               <p>Scientific and Technology Department, strada le Grazie 15, 37134 Verona, Italy</p>
            </ins>
            <ins id="I5">
               <p>Istituto di Genomica Applicata, Parco Scientifico e Tecnologico di Udine, Via Linussio 51, 33100 Udine, Italy</p>
            </ins>
            <ins id="I6">
               <p>CRIBI, Universit&#224; degli Studi di Padova, viale G. Colombo, 35121 Padova, Italy</p>
            </ins>
         </insg>
         <source>Genome Biology</source>
         <issn>1465-6906</issn>
         <pubdate>2008</pubdate>
         <volume>9</volume>
         <issue>12</issue>
         <fpage>R175</fpage>
         <url>http://genomebiology.com/2008/9/12/R175</url>
         <xrefbib>
            <pubidlist>
               <pubid idtype="pmpid">19087247</pubid>
               <pubid idtype="doi">10.1186/gb-2008-9-12-r175</pubid>
            </pubidlist>
         </xrefbib>
      </bibl>
      <history>
         <rec>
            <date>
               <day>9</day>
               <month>9</month>
               <year>2008</year>
            </date>
         </rec>
         <revrec>
            <date>
               <day>30</day>
               <month>10</month>
               <year>2008</year>
            </date>
         </revrec>
         <acc>
            <date>
               <day>16</day>
               <month>12</month>
               <year>2008</year>
            </date>
         </acc>
         <pub>
            <date>
               <day>16</day>
               <month>12</month>
               <year>2008</year>
            </date>
         </pub>
      </history>
      <cpyrt>
         <year>2008</year>
         <collab>Denoeud et al.; licensee BioMed Central Ltd.</collab>
         <note>This is an open access article distributed under the terms of the Creative Commons Attribution License (<url>http://creativecommons.org/licenses/by/2.0</url>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</note>
      </cpyrt>
      <shorttitle>
         <p>G-Mo.R-Se: gene modeling using RNA-Seq</p>
      </shorttitle>
      <shortabs>
         <p>A method for de novo genome annotation using high-throughput cDNA sequencing data.</p>
      </shortabs>
      <abs>
         <sec>
            <st>
               <p>Abstract</p>
            </st>
            <p>Next generation technologies enable massive-scale cDNA sequencing (so-called RNA-Seq). Mainly because of the difficulty of aligning short reads on exon-exon junctions, no attempts have been made so far to use RNA-Seq for building gene models <it>de novo</it>, that is, in the absence of a set of known genes and/or splicing events. We present <it>G-Mo.R-Se </it>(Gene Modelling using RNA-Seq), an approach aimed at building gene models directly from RNA-Seq and demonstrate its utility on the grapevine genome.</p>
         </sec>
      </abs>
   </fm>
   <meta>
      <classifications>
         <classification type="BMC" subtype="man_spc_id" id="30010010">Genome studies</classification>
         <classification type="BMC" subtype="man_spc_id" id="30010013">Methods</classification>
      </classifications>
   </meta>
   <bdy>
      <sec>
         <st>
            <p>Background</p>
         </st>
         <p>Next generation sequencing technologies generate many short reads of DNA fragments in a reduced time scale and have lowered the cost per nucleotide <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B2">2</abbr></abbrgrp>. Genomic short reads have been used to investigate genetic variation <abbrgrp><abbr bid="B3">3</abbr></abbrgrp>, genomic rearrangements <abbrgrp><abbr bid="B4">4</abbr></abbrgrp>, DNA methylation <abbrgrp><abbr bid="B5">5</abbr></abbrgrp>, and transcription factor binding sites (ChIP-Seq) <abbrgrp><abbr bid="B6">6</abbr><abbr bid="B7">7</abbr></abbrgrp>. New algorithms had to be developed for genome resequencing, in order to map very high numbers of reads efficiently <abbrgrp><abbr bid="B8">8</abbr><abbr bid="B9">9</abbr><abbr bid="B10">10</abbr><abbr bid="B11">11</abbr></abbrgrp>, as well as for <it>de novo </it>genome assemblies, in order to cope with the short length of reads (usually less than 35 nucleotides) <abbrgrp><abbr bid="B12">12</abbr><abbr bid="B13">13</abbr><abbr bid="B14">14</abbr><abbr bid="B15">15</abbr><abbr bid="B16">16</abbr></abbrgrp>. The next-generation sequencing methods have also been applied to sequence cDNAs rather than genomic DNA, in order to catalogue microRNAs <abbrgrp><abbr bid="B17">17</abbr><abbr bid="B18">18</abbr><abbr bid="B19">19</abbr></abbrgrp> or analyze the transcriptional landscape of a number of eukaryotic genomes: this technology is called RNA-Seq <abbrgrp><abbr bid="B20">20</abbr><abbr bid="B21">21</abbr><abbr bid="B22">22</abbr><abbr bid="B23">23</abbr><abbr bid="B24">24</abbr><abbr bid="B25">25</abbr><abbr bid="B26">26</abbr></abbrgrp>.</p>
         <p>Before the development of the RNA-Seq technology, large-scale RNA analysis could be performed with two types of approaches. The first, tag-based approaches <abbrgrp><abbr bid="B27">27</abbr></abbrgrp>, such as serial analysis of gene expression (SAGE) <abbrgrp><abbr bid="B28">28</abbr></abbrgrp> and massively parallel signature sequencing (MPSS) <abbrgrp><abbr bid="B29">29</abbr></abbrgrp>, were based on the sequencing of previously cloned tags located in specific transcript locations (usually 3' or 5' ends). Transcript abundance could be derived from tag counts in already known loci, but no new genes or new alternative splice forms could be discovered. The alternative approach, hybridization-based microarrays, has the potential of monitoring the expression level on the whole transcriptome (not necessarily biased towards known genes, when using whole genome tiling arrays <abbrgrp><abbr bid="B30">30</abbr><abbr bid="B31">31</abbr><abbr bid="B32">32</abbr></abbrgrp>) at low cost, but it is biased by the background levels of hybridization and the fact that probes differ in their hybridization properties. Nevertheless, the gold standard method for transcript discovery remains expressed sequence tag (EST) sequencing (by Sanger technology) of cloned cDNAs <abbrgrp><abbr bid="B33">33</abbr><abbr bid="B34">34</abbr><abbr bid="B35">35</abbr></abbrgrp>. Its main limitation, in addition to the relatively high cost, is that this method is sensitive to cloning biases. The RNA-Seq technology combines the advantages of the previous large-scale RNA analysis methods by enabling the monitoring of the transcriptional landscape of a whole genome at low cost, without the biases introduced by arrays, and has the additional advantage of providing information on the transcript structures (exon-exon boundaries), as EST Sanger type sequencing does on a longer range, but without cloning biases. 
Moreover, because a large number of reads can easily be obtained, RNA-Seq is sensitive enough to detect transcription for genes with low expression levels, which are usually missed by EST analysis <abbrgrp><abbr bid="B21">21</abbr><abbr bid="B23">23</abbr><abbr bid="B25">25</abbr></abbrgrp>.</p>
         <p>In recent studies, RNA-Seq has mainly been used to quantify the expression levels of already annotated loci, identify differentially expressed genes, and measure expression outside of those loci (in intronic or intergenic regions) <abbrgrp><abbr bid="B21">21</abbr><abbr bid="B22">22</abbr><abbr bid="B23">23</abbr><abbr bid="B24">24</abbr><abbr bid="B26">26</abbr></abbrgrp>. Additionally, structural information has been used to detect already known alternative splice forms <abbrgrp><abbr bid="B22">22</abbr><abbr bid="B23">23</abbr></abbrgrp>, identify new transcriptional events in relation to known loci (alternative splicing, 5' ends) <abbrgrp><abbr bid="B24">24</abbr><abbr bid="B26">26</abbr></abbrgrp>, and refine annotated gene structures or propose novel gene models <abbrgrp><abbr bid="B21">21</abbr><abbr bid="B23">23</abbr></abbrgrp>. However, no attempts have been made to take advantage of the connectivity information contained in RNA-Seq data for building gene models <it>de novo</it>, that is, in the absence of a set of known genes and/or splicing events.</p>
         <p>Traditionally, EST, cDNA and protein sequences are the most accurate resource for identifying gene loci and annotating the exon/intron structure on genomic sequences <abbrgrp><abbr bid="B36">36</abbr></abbrgrp>. These resources can be mapped on a genomic sequence with a global alignment strategy that allows gap insertions of genomic regions corresponding to potential introns bordered by splice sites <abbrgrp><abbr bid="B37">37</abbr><abbr bid="B38">38</abbr><abbr bid="B39">39</abbr><abbr bid="B40">40</abbr><abbr bid="B41">41</abbr></abbrgrp>. The resulting positions of exon and intron boundaries can then be assembled to build complete transcript structures <abbrgrp><abbr bid="B42">42</abbr></abbrgrp>. But the methods used to build spliced alignments of ESTs on genomes are not applicable to short reads, since they require that the sequence blocks surrounding a splice junction are long enough and highly similar to the genomic region in order to build a non-ambiguous alignment covering the exon-exon boundary. New methods are now emerging for building spliced alignments of short sequence reads <abbrgrp><abbr bid="B43">43</abbr></abbrgrp>. However, they still require <it>a priori </it>information about the genome analyzed (splice site characteristics) in order to reduce the number of junctions to test, since testing all possible 'GT/C-AG' pairs in a genome is obviously unfeasible.</p>
         <p>In this study, we present a method aimed at using RNA-Seq short reads to build <it>de novo </it>gene models. First, candidate exons are built directly from the positions of the reads mapped on the genome (without <it>ab initio </it>assembly of the reads), and then all possible splice junctions between those exons are tested against unmapped reads: the testing of junctions is directed by the information available in the RNA-Seq dataset rather than by <it>a priori </it>knowledge about the genome. Exons can then be chained into stranded gene models. We demonstrate the feasibility of this method, which we call <it>G-Mo.R-Se </it>(for Gene Modelling using RNA-Seq), on the grapevine genome <abbrgrp><abbr bid="B44">44</abbr></abbrgrp> using approximately 175 million Solexa/Illumina RNA-Seq reads from four tissues. This allowed the identification of new exons (in known loci) and alternative splice forms, as well as entirely new loci. We show that this approach is an efficient alternative to standard cDNA sequencing: it detects more transcripts at lower cost. It could be particularly helpful in the case of species for which few resources are available (that is, that are very distant from the species currently present in the ESTs/protein databases). <it>G-Mo.R-Se </it>can also be combined with other data into an automatic or manual eukaryotic genome annotation. All the data described in this article are available from the <it>G-Mo.R-Se </it>website <abbrgrp><abbr bid="B45">45</abbr></abbrgrp>.</p>
      </sec>
      <sec>
         <st>
            <p>Results and discussion</p>
         </st>
         <sec>
            <st>
               <p>Building gene models from RNA-Seq reads</p>
            </st>
            <p>We obtained 173 million Solexa/Illumina RNA-Seq reads from mRNAs extracted from four tissues (leaf, root, stem, callus). Of these, 138 million reads could be mapped unambiguously with SOAP (Short Oligonucleotide Analysis Package) <abbrgrp><abbr bid="B8">8</abbr></abbrgrp> to the <it>Vitis vinifera </it>genome sequence assembly <abbrgrp><abbr bid="B44">44</abbr></abbrgrp>. The mapped reads were contiged to build candidate exons, which we call 'covtigs' (for coverage contigs, that is, regions obtained by contiging adjacent positions with coverage depth greater than a threshold). Candidate junctions between covtigs were then tested using the unmapped reads. Finally, a graph approach was used to chain the exons through validated junctions into gene models (see Materials and methods; Figure <figr fid="F1">1</figr>). All possible chainings between exons were retained, which allowed the annotation of alternative splice forms. The covtigs that were not involved in any validated junction were discarded, implying that no mono-exonic transcripts were annotated. The procedure, which we named <it>G-Mo.R-Se</it>, produced 46,062 transcript models, clustered in 19,486 loci (an average of 2.4 transcripts per locus). A plausible coding sequence (CDS) was found for 28,399 models, clustered in 12,341 loci.</p>
            <fig id="F1">
               <title>
                  <p>Figure 1</p>
               </title>
               <caption>
                  <p><it>G-Mo.R-Se </it>method for building gene models from short reads</p>
               </caption>
               <text>
                  <p><b><it>G-Mo.R-Se </it>method for building gene models from short reads</b>. The five black boxes show the 5 steps of the approach. Step 1 (covtig construction) is the construction of covtigs (coverage contigs), which are built from positions where short reads are mapped above a given depth threshold. Step 2 (candidate exons) is the definition of a list of stranded candidate exons derived from each covtig. Splice sites are searched 100 nucleotides around each covtig boundary, which allows the orientation of the candidate exons on the forward or the reverse strand, as shown in the second box. Step 3 (junction validation) consists of the validation of junctions between candidate exons using a word dictionary built from the unmapped reads. During step 4 (graph of candidate exons linked by validated junctions), a graph is created where nodes are candidate exons (black boxes) and oriented edges (purple arrows) between two nodes represent validated junctions. The last two connected components show an example of a split gene that can be corrected using open reading frame detection between the last exon of the first model and the first exon of the second model. In the final step, step 5 (model construction and coding sequence detection) we go through the previous graph and extract all possible paths between each source and each sink. Each path will then represent a predicted transcript, and a CDS will be identified for each transcript. Models M<sub>1</sub>, M<sub>2</sub>, M<sub>5 </sub>and M<sub>7 </sub>(untranslated regions are in grey, introns in black and coding exons in red) correctly model real transcripts T<sub>1</sub>, T<sub>2</sub>, T<sub>3 </sub>and T<sub>5 </sub>(untranslated regions are in grey, and introns and exons are indicated by black lines and boxes, respectively). 
As all possible paths are extracted from the graph, some of them may not correspond to real transcripts (for example, models M<sub>3</sub>, M<sub>4 </sub>and M<sub>6</sub>).</p>
               </text>
               <graphic file="gb-2008-9-12-r175-1"/>
            </fig>
            <p>Covtig definition was essential for the subsequent testing of junctions to be efficient, especially with respect to the splits and fusions of exons (see Materials and methods). The splitting of exons into separate covtigs can occur when the read coverage depth goes down (below the depth threshold used for building covtigs), which can be due either to repeated regions (we only retained the reads that mapped at a unique position on the genome), to mismatches/gaps in the genomic sequence (we only kept the reads mapped with at most two mismatches and no indels), or to experimental biases leading to depth variations in the cDNAs sequenced and to non-normalization of the library. Indeed, some biases in the coverage uniformity of reads have been observed in previous RNA-Seq studies <abbrgrp><abbr bid="B23">23</abbr></abbrgrp>.</p>
            <p>We aimed at correcting the splits in two ways. First, at the covtig definition step (step 1 in Figure <figr fid="F1">1</figr>), we extended the covtigs using all 16-mers found in the reads, in order to step over mismatches and short repeats. Then, at the model building step (step 4 in Figure <figr fid="F1">1</figr>), we fused together models that were linked by an open reading frame.</p>
            <p>The artifactual fusing of exons into one single covtig can occur when the mRNA sample contains immature transcripts with retained introns, providing reads that map into the introns. Since the immature transcripts are expected to be under-represented in the set of mRNAs, the depth in the retained introns is expected to be lower than in the adjacent exons: setting an appropriate depth threshold for the building of covtigs should avoid such fusions.</p>
            <p>The depth threshold used for covtig construction was set to balance the number of splits and the number of fusions. Indeed, low thresholds will generate few splits but numerous fusions, and conversely, high thresholds will generate few fusions but numerous splits. In order to correct more fusions, we could extend the testing of junctions inside the covtigs, instead of testing junctions only between covtigs.</p>
            <p>We evaluated the direct mapping of reads, the initial candidate exons (covtigs), and the final models produced by <it>G-Mo.R-Se </it>at the nucleotide level in comparison to the reference <it>V. vinifera </it>annotation <abbrgrp><abbr bid="B44">44</abbr></abbrgrp> (Table <tblr tid="T1">1</tblr>). The depth threshold set to build the covtigs discards most of the noise (63% of the nucleotides covered by reads are located in intergenic or intronic compartments compared to only 40% of the nucleotides covered by covtigs) while retaining the signal falling in exons (66% of the exonic nucleotides are covered by reads, and 56% are covered by covtigs). This noise is likely to correspond to transcriptional background, expression of transposable elements, or genomic contamination in the samples sequenced, rather than to SOAP mapping artifacts, since we only retained positions where reads could be mapped uniquely, with at most two mismatches. When considering final models instead of initial covtigs, the sensitivity decreases slightly (from 56% to 43% of exonic bases covered) but the specificity increases greatly (from 60% to 80% of the nucleotides - in covtigs or models - fall in the exonic compartment), suggesting that most of the covtigs that could not be linked to any other covtig resulted from transcriptional or experimental noise. The models still include about 1% of the nucleotides from the intergenic compartment, indicating that this compartment harbors new, previously unannotated, genes.</p>
            <tbl id="T1">
               <title>
                  <p>Table 1</p>
               </title>
               <caption>
                  <p>Nucleotide-level overlap of RNA-Seq reads, <it>G-Mo.R-Se </it>covtigs and <it>G-Mo.R-Se </it>models with different genomic compartments relative to the reference annotation</p>
               </caption>
               <tblbdy cols="7">
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="6" ca="center">
                        <p>Genomic compartment relative to the reference annotation (%)</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="6">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="2" ca="center">
                        <p>Exonic: 41,603,635 nucleotides</p>
                     </c>
                     <c cspan="2" ca="center">
                        <p>Intronic: 184,047,761 nucleotides</p>
                     </c>
                     <c cspan="2" ca="center">
                        <p>Intergenic: 271,857,375 nucleotides</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="2">
                        <hr/>
                     </c>
                     <c cspan="2">
                        <hr/>
                     </c>
                     <c cspan="2">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>Specificity</p>
                     </c>
                     <c ca="center">
                        <p>Sensitivity</p>
                     </c>
                     <c ca="center">
                        <p>Specificity</p>
                     </c>
                     <c ca="center">
                        <p>Sensitivity</p>
                     </c>
                     <c ca="center">
                        <p>Specificity</p>
                     </c>
                     <c ca="center">
                        <p>Sensitivity</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="7">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Reads: 73,580,625 nucleotides</p>
                     </c>
                     <c ca="center">
                        <p>37</p>
                     </c>
                     <c ca="center">
                        <p>66</p>
                     </c>
                     <c ca="center">
                        <p>40</p>
                     </c>
                     <c ca="center">
                        <p>16</p>
                     </c>
                     <c ca="center">
                        <p>23</p>
                     </c>
                     <c ca="center">
                        <p>6</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Covtigs: 38,484,212 nucleotides</p>
                     </c>
                     <c ca="center">
                        <p>60</p>
                     </c>
                     <c ca="center">
                        <p>56</p>
                     </c>
                     <c ca="center">
                        <p>20</p>
                     </c>
                     <c ca="center">
                        <p>4</p>
                     </c>
                     <c ca="center">
                        <p>20</p>
                     </c>
                     <c ca="center">
                        <p>3</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Models: 22,213,316 nucleotides</p>
                     </c>
                     <c ca="center">
                        <p>80</p>
                     </c>
                     <c ca="center">
                        <p>43</p>
                     </c>
                     <c ca="center">
                        <p>5</p>
                     </c>
                     <c ca="center">
                        <p>1</p>
                     </c>
                     <c ca="center">
                        <p>15</p>
                     </c>
                     <c ca="center">
                        <p>1</p>
                     </c>
                  </r>
               </tblbdy>
               <tblfn>
                  <p>Specificity reflects the percentage of nucleotides in reads/covtigs/models falling in the compartment; sensitivity reflects the percentage of nucleotides in the genomic compartment overlapped by reads/covtigs/models.</p>
               </tblfn>
            </tbl>
            <p>We managed to select a satisfactory depth threshold with respect to the splits/fusions (Figure S1 in Additional data file 1), as well as the signal/noise ratios. Obviously, the optimal depth threshold will be highly dependent on the characteristics of the dataset analyzed, such as the complexity of the transcriptome, the amount of alternative splicing, the amount of transcription outside of protein-coding genes, and the sequencing depth, and must be carefully selected in order for <it>G-Mo.R-Se </it>to work optimally.</p>
         </sec>
         <sec>
            <st>
               <p>Comparing the <it>G-Mo.R-Se </it>pipeline with direct assembly of reads</p>
            </st>
            <p>We compared the final <it>G-Mo.R-Se </it>models and the structures obtained by assembling the reads with Velvet <abbrgrp><abbr bid="B14">14</abbr></abbrgrp> and mapping the assembled contigs to the genome with est2genome <abbrgrp><abbr bid="B37">37</abbr></abbrgrp> (Table <tblr tid="T2">2</tblr>). Fewer reference genes are overlapped (on at least one nucleotide) by spliced Velvet contigs than by models (40.3% and 50.3%, respectively). The number of genes overlapped on at least 75% of their nucleotides drops even more for Velvet contigs compared to <it>G-Mo.R-Se </it>models (from 30.6% to 11.8%), indicating that most of the genes that are overlapped by Velvet contigs are not covered over their whole length. The average number of models or Velvet contigs per gene - 1.28 and 2.05, respectively - also reflects that the reference genes are more fragmented by Velvet contigs than by <it>G-Mo.R-Se </it>models. Additionally, we investigated the accuracy of the <it>G-Mo.R-Se </it>models and Velvet contigs from a structural point of view using a collection of cDNAs: 56% of the cDNA loci are predicted exactly (all exon/intron boundaries) by <it>G-Mo.R-Se </it>models, and 32% by Velvet contigs (Table S1 in Additional data file 1). We compared the average coverage depth of reference genes that are correctly annotated by <it>G-Mo.R-Se </it>models and Velvet contigs (that is, that have at least 75% of their nucleotides covered). A minimal depth of 4 is sufficient for <it>G-Mo.R-Se </it>models to annotate genes satisfactorily, whereas a minimal depth of 13 is required for Velvet contigs (Figure <figr fid="F2">2</figr>). Since <it>G-Mo.R-Se </it>relies on the genome sequence, no significant overlap between reads is necessary to put them together in a covtig: they just need to be adjacent on the genome. This explains why a much lower coverage depth is required for <it>G-Mo.R-Se </it>than for Velvet. 
Unlike direct assembly of reads, the <it>G-Mo.R-Se </it>pipeline is able to detect transcripts that are weakly represented in the read set (either because they are weakly expressed or problematic to extract).</p>
            <fig id="F2">
               <title>
                  <p>Figure 2</p>
               </title>
               <caption>
                  <p>Read coverage depth for reference genes overlapped by <it>G-Mo.R-Se </it>models and Velvet contigs</p>
               </caption>
               <text>
                  <p><b>Read coverage depth for reference genes overlapped by <it>G-Mo.R-Se </it>models and Velvet contigs</b>. The distribution of the average depth (log) on all exonic nucleotides of the genes is plotted for genes overlapped on &#8805; 75% of their nucleotides by <it>G-Mo.R-Se </it>models (red line) and Velvet contigs (dashed purple line). The y-axis corresponds to the percentage of reference genes in each bin (bin width is 0.2).</p>
               </text>
               <graphic file="gb-2008-9-12-r175-2"/>
            </fig>
            <tbl id="T2">
               <title>
                  <p>Table 2</p>
               </title>
               <caption>
                  <p>Overlap of the 30,434 reference genes with Velvet spliced contigs and <it>G-Mo.R-Se </it>models</p>
               </caption>
               <tblbdy cols="3">
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>Velvet assembly + mapping</p>
                     </c>
                     <c ca="center">
                        <p><it>G-Mo.R-Se </it>models</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="3">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Percentage of reference exonic nucleotides covered</p>
                     </c>
                     <c ca="center">
                        <p>24.8%</p>
                     </c>
                     <c ca="center">
                        <p>42.9%</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Reference genes overlapped on &#8805; 1 nucleotide</p>
                     </c>
                     <c ca="center">
                        <p>12,270 (40.3%)</p>
                     </c>
                     <c ca="center">
                        <p>15,323 (50.3%)</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Reference genes overlapped on &#8805; 75% nucleotides</p>
                     </c>
                     <c ca="center">
                        <p>3,595 (11.8%)</p>
                     </c>
                     <c ca="center">
                        <p>9,306 (30.6%)</p>
                     </c>
                  </r>
               </tblbdy>
            </tbl>
         </sec>
         <sec>
            <st>
               <p>Comparing the <it>G-Mo.R-Se </it>approach to a classic cDNA sequencing approach</p>
            </st>
            <p>We compared the <it>G-Mo.R-Se </it>pipeline to a classic cDNA sequencing approach, using a reference set of 112,175 <it>V. vinifera </it>cDNA sequences from five tissues (including 87,199 multi-exonic cDNAs clustered in 7,895 loci) that were sequenced with the Sanger technology during the course of the <it>V. vinifera </it>genome sequencing and annotation project <abbrgrp><abbr bid="B44">44</abbr></abbrgrp> (Table <tblr tid="T3">3</tblr>).</p>
            <tbl id="T3">
               <title>
                  <p>Table 3</p>
               </title>
               <caption>
                  <p>Overlap of cDNA loci (all loci and loci where all 32-mers are unique) with <it>G-Mo.R-Se </it>models</p>
               </caption>
               <tblbdy cols="3">
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>All cDNA clusters (7,895)</p>
                     </c>
                     <c ca="center">
                        <p>cDNA clusters where all 32-mers are unique (4,822)</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="3">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Percentage of cDNA (exonic) nucleotides covered by models</p>
                     </c>
                     <c ca="center">
                        <p>76.0%</p>
                     </c>
                     <c ca="center">
                        <p>87.2%</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>cDNA clusters overlapped on &#8805; 1 nucleotide by models</p>
                     </c>
                     <c ca="center">
                        <p>6,831 (87%)</p>
                     </c>
                     <c ca="center">
                        <p>4,581 (95%)</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>cDNA clusters overlapped on &#8805; 75% nucleotides by models</p>
                     </c>
                     <c ca="center">
                        <p>5,449 (69%)</p>
                     </c>
                     <c ca="center">
                        <p>3,997 (83%)</p>
                     </c>
                  </r>
               </tblbdy>
            </tbl>
            <p>The 46,062 <it>G-Mo.R-Se </it>models overlap about 70% of the 7,895 cDNA loci (on more than 75% of their nucleotides). The most obvious reason why about 15% of the cDNA loci are not overlapped by any model is that they correspond to repetitive DNA. We compared the proportion of unique 32-mers (in the whole <it>V. vinifera </it>genome) for the 5,449 cDNA loci well covered by models and the 1,064 cDNA loci uncovered by models. It appears that most of the cDNA loci that were missed by models are mainly constituted of non-unique 32-mers (Figure <figr fid="F3">3</figr>). When considering only the 4,822 loci where all the 32-mers are unique, 95% of the cDNA loci are hit by a model (Table <tblr tid="T3">3</tblr>). Among the 5% of cDNA loci that are missed, some are too poorly covered by reads for covtigs to be built and/or junctions to be validated, and others have reads in their introns, which create fused exons, preventing the models from being detected as spliced, since one large covtig spans the whole locus.</p>
            <fig id="F3">
               <title>
                  <p>Figure 3</p>
               </title>
               <caption>
                  <p>Proportion of unique 32-mers in cDNA clusters</p>
               </caption>
               <text>
                  <p><b>Proportion of unique 32-mers in cDNA clusters</b>. The percentage of unique 32-mers is shown for cDNA clusters overlapped by models on more than 75% of their nucleotides (green) and cDNA clusters not overlapped by models (red). The y-axis corresponds to the percentage of cDNA clusters in each bin (bin width is 10% of unique 32-mers among all 32-mers in the cluster).</p>
               </text>
               <graphic file="gb-2008-9-12-r175-3"/>
            </fig>
            <p>Interestingly, <it>G-Mo.R-Se </it>detects 2.5 times as many loci as the standard cDNA sequencing approach (19,486 loci versus 7,895). Among the 19,486 <it>G-Mo.R-Se </it>loci, only 36% overlap cDNA loci. We compared the characteristics of the 5,698 <it>G-Mo.R-Se </it>loci that overlap cDNAs on at least 50% of their nucleotides and the 12,392 loci that are outside cDNA loci (Figure <figr fid="F4">4</figr>). The <it>G-Mo.R-Se </it>loci that are new with respect to standard cDNAs tend to be expressed at lower levels than the loci that overlap cDNAs. These loci are investigated in more detail in the section 'Identifying novel genes and improving gene annotation'. The RNA-Seq technology, combined with <it>G-Mo.R-Se</it>, is able to detect gene expression that would be scored silent with a standard cDNA cloning and sequencing approach, or would necessitate an extensive Sanger sequencing effort.</p>
            <fig id="F4">
               <title>
                  <p>Figure 4</p>
               </title>
               <caption>
                  <p>Read coverage depth for models overlapping cDNA loci and models not overlapping cDNAs</p>
               </caption>
               <text>
                  <p><b>Read coverage depth for models overlapping cDNA loci and models not overlapping cDNAs</b>. The distribution of the average depth (log) on all exonic nucleotides of the models is plotted for models overlapping cDNAs on &#8805; 50% of their nucleotides (green) and models not overlapping cDNAs (black). The y-axis corresponds to the percentage of models in each bin (bin width is 0.2).</p>
               </text>
               <graphic file="gb-2008-9-12-r175-4"/>
            </fig>
            <p>On average, we annotated 2.4 models per locus. By removing the redundancy (structures fully included in other structures; see Materials and methods) from the cDNA sequences, we retained 9,827 representative sequences, with an average of 1.25 transcripts per locus. The models appear to be capturing more alternative splice forms than the cDNAs. However, as we build all possible models that correspond to the longest possible paths going from one covtig to another through validated junctions, some of the models probably do not correspond to real transcripts (for instance, if they link alternative exons that are incompatible, like models M<sub>3 </sub>and M<sub>4 </sub>in Figure <figr fid="F1">1</figr>). Since the long-range splice contiguity cannot be inferred from short reads, we quantified short-range alternative splicing events in the models (all models, and only CDS portions of coding models) and in the cDNAs <abbrgrp><abbr bid="B46">46</abbr></abbrgrp> (Table <tblr tid="T4">4</tblr>).</p>
            <tbl id="T4">
               <title>
                  <p>Table 4</p>
               </title>
               <caption>
                  <p>Alternative splicing events detected in cDNAs, all <it>G-Mo.R-Se </it>models, and CDS portions of <it>G-Mo.R-Se </it>models</p>
               </caption>
               <tblbdy cols="6">
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>cDNAs: 7,895 loci</p>
                     </c>
                     <c ca="center">
                        <p>Models (all): 19,486 loci</p>
                     </c>
                     <c ca="center">
                        <p>Models (CDS): 12,341 loci</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="1">
                        <hr/>
                     </c>
                     <c cspan="1">
                        <hr/>
                     </c>
                     <c cspan="1">
                        <hr/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>Number (%)</p>
                     </c>
                     <c ca="center">
                        <p>Number (%)</p>
                     </c>
                     <c ca="center">
                        <p>Number (%)</p>
                     </c>
                     <c ca="center">
                        <p>Events common to cDNAs and models</p>
                     </c>
                     <c ca="center">
                        <p>% of cDNA events</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="6">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Alternative acceptor/donor</p>
                     </c>
                     <c ca="center">
                        <p>690 (73.1%)</p>
                     </c>
                     <c ca="center">
                        <p>7,405 (62.5%)</p>
                     </c>
                     <c ca="center">
                        <p>2,988 (58.0%)</p>
                     </c>
                     <c ca="center">
                        <p>156</p>
                     </c>
                     <c ca="center">
                        <p>22.6</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Skipped</p>
                     </c>
                     <c ca="center">
                        <p>250 (26.5%)</p>
                     </c>
                     <c ca="center">
                        <p>3,656 (30.9%)</p>
                     </c>
                     <c ca="center">
                        <p>1,677 (32.5%)</p>
                     </c>
                     <c ca="center">
                        <p>18</p>
                     </c>
                     <c ca="center">
                        <p>7.2</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Mutually exclusive</p>
                     </c>
                     <c ca="center">
                        <p>4 (0.4%)</p>
                     </c>
                     <c ca="center">
                        <p>781 (6.6%)</p>
                     </c>
                     <c ca="center">
                        <p>487 (9.5%)</p>
                     </c>
                     <c ca="center">
                        <p>1</p>
                     </c>
                     <c ca="center">
                        <p>25.0</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Intron retention</p>
                     </c>
                     <c ca="center">
                        <p>1,227</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Total</p>
                     </c>
                     <c ca="center">
                        <p>2,171 (944 without IR)</p>
                     </c>
                     <c ca="center">
                        <p>11,842</p>
                     </c>
                     <c ca="center">
                        <p>5,152</p>
                     </c>
                     <c ca="center">
                        <p>175</p>
                     </c>
                     <c ca="center">
                        <p>18.5</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Total number of loci with alternative splicing (% of all identified loci)</p>
                     </c>
                     <c ca="center">
                        <p>783 (9.9%) (598 without IR)</p>
                     </c>
                     <c ca="center">
                        <p>1,602 (8.2%)</p>
                     </c>
                     <c ca="center">
                        <p>1,029 (8.3%)</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                  </r>
               </tblbdy>
            </tbl>
            <p>The <it>G-Mo.R-Se </it>pipeline does not allow the detection of intron retentions (IRs), since we do not currently test junctions inside covtigs: if the depth in the retained intron is greater than the threshold we used to build the covtigs, we will get only one splice variant containing the retained intron. It is likely that most of the exon fusions we detected by comparison with the cDNAs (Figure S1 in Additional data file 1) correspond to cases of IRs. However, we were able to detect alternative donors or acceptors, skipped exons, and mutually exclusive exons. The relative abundance of these different classes of events is similar in the models and the cDNAs (from the most prevalent to the least prevalent: alternative acceptors/donors, skipped exons, mutually exclusive exons), but the total number of alternative splicing events in models (11,842 in all models, 5,152 in CDS portions) is much higher than in cDNAs (944 events, when removing the 1,227 IRs). The splice forms expressed at low levels, which could not be detected with cDNA cloning and Sanger sequencing, appear to harbor an unexpected number of alternative splicing events. It is likely that not all of these events are compatible with the coding capacity of the transcripts. However, when restraining the analysis to the coding portions of models with plausible CDSs (that is, likely to be correctly predicted), the number of alternative splicing events remains higher than for cDNAs and the proportions of the different types of events remain unchanged. As an example, Figure <figr fid="F5">5</figr> shows a locus where three alternative coding models were predicted: two of them (M<sub>2 </sub>and M<sub>3</sub>) are already supported by EST evidence, but the third model (M<sub>1</sub>) corresponds to a novel alternative splice form. 
Although the number of alternative splicing events is higher in the RNA-Seq dataset than in the cDNA dataset, the proportion of loci where alternative splicing occurs is similar for cDNA clusters and <it>G-Mo.R-Se </it>models (10% and 8%, respectively). These results are in agreement with previous studies that showed that the fraction of alternatively spliced genes is lower in plants than in animals <abbrgrp><abbr bid="B47">47</abbr></abbrgrp>. Notably, of the 944 non-IR events detected in cDNAs, the models detect only 175 (18.5%): though some of these events might result from incorrect mapping of the cDNAs, most of them are likely to be real, and to have been missed by <it>G-Mo.R-Se </it>(Table <tblr tid="T4">4</tblr>). The pipeline detected only 7.2% of the skipped exons and 25% of the mutually exclusive exons, which is likely due to the limited number of neighboring covtigs (20) we tested to validate the junctions. Only 22.6% of the alternative donors/acceptors were detected because we searched for junctions only 100 nucleotides around the covtig boundaries, which limited the window where alternative splice sites could be discovered (see Materials and methods). Obviously, the model construction was not designed to capture the whole alternative splicing landscape of a genome. But still, the non-exhaustive view that we obtain is much richer than what could have been suspected from classic EST sequencing. In order to study alternative splicing exhaustively, which is beyond the scope of this study, specific tools will need to be developed.</p>
            <fig id="F5">
               <title>
                  <p>Figure 5</p>
               </title>
               <caption>
                  <p>Example of alternatively spliced models built from short reads</p>
               </caption>
               <text>
                  <p><b>Example of alternatively spliced models built from short reads</b>. The figure shows a capture of a 4 kb genomic region from <it>V. vinifera </it>chromosome 12 between 3,836,500 bp and 3,840,500 bp. The first track (Genoscope annotations) contains the automatic annotation from <abbrgrp><abbr bid="B44">44</abbr></abbrgrp>. The green models are GeneWise alignments of Uniprot proteins. Alignments of <it>V. vinifera </it>cDNAs from <abbrgrp><abbr bid="B44">44</abbr></abbrgrp> are in red, and public <it>V. vinifera </it>ESTs are in light green. The next track displays the models predicted by <it>G-Mo.R-Se </it>(untranslated region in grey, CDS in red). Initial covtigs are displayed as brown boxes (average depth of covtigs is written below each covtig). Alignments of Velvet contigs are displayed in purple. <it>Ab initio </it>models produced by geneID <abbrgrp><abbr bid="B51">51</abbr></abbrgrp> and SNAP <abbrgrp><abbr bid="B52">52</abbr></abbrgrp> are displayed in blue and pink, respectively. The short-read coverage depth is plotted on the last track (black): the dashed red line shows the threshold used to build covtigs. Model M<sub>2 </sub>is confirmed by numerous resources, model M<sub>3 </sub>seems to be a minor alternative splice form (it is only supported by two public ESTs: E<sub>1 </sub>and E<sub>2</sub>), and model M<sub>1 </sub>is a novel alternative splice form.</p>
               </text>
               <graphic file="gb-2008-9-12-r175-5"/>
            </fig>
            <tbl id="T5">
               <title>
                  <p>Table 5</p>
               </title>
               <caption>
                  <p>Characteristics of known and novel <it>G-Mo.R-Se </it>models (all, and with a plausible CDS)</p>
               </caption>
               <tblbdy cols="5">
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="2" ca="center">
                        <p>Known model loci</p>
                     </c>
                     <c cspan="2" ca="center">
                        <p>Novel model loci</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="2">
                        <hr/>
                     </c>
                     <c cspan="2">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>All models</p>
                     </c>
                     <c ca="center">
                        <p>Models with a plausible CDS (65%)</p>
                     </c>
                     <c ca="center">
                        <p>All models</p>
                     </c>
                     <c ca="center">
                        <p>Models with a plausible CDS (17%)</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="5">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Number of loci</p>
                     </c>
                     <c ca="center">
                        <p>18,811</p>
                     </c>
                     <c ca="center">
                        <p>12,236</p>
                     </c>
                     <c ca="center">
                        <p>675</p>
                     </c>
                     <c ca="center">
                        <p>105</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Number of models</p>
                     </c>
                     <c ca="center">
                        <p>45,290</p>
                     </c>
                     <c ca="center">
                        <p>28,283</p>
                     </c>
                     <c ca="center">
                        <p>772</p>
                     </c>
                     <c ca="center">
                        <p>116</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Average number of models per locus</p>
                     </c>
                     <c ca="center">
                        <p>2.4</p>
                     </c>
                     <c ca="center">
                        <p>2.3</p>
                     </c>
                     <c ca="center">
                        <p>1.1</p>
                     </c>
                     <c ca="center">
                        <p>1.1</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Average number of exons per model</p>
                     </c>
                     <c ca="center">
                        <p>8.2</p>
                     </c>
                     <c ca="center">
                        <p>8.9</p>
                     </c>
                     <c ca="center">
                        <p>2.3</p>
                     </c>
                     <c ca="center">
                        <p>2.9</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Number of models with more than two exons</p>
                     </c>
                     <c ca="center">
                        <p>37,644 (83%)</p>
                     </c>
                     <c ca="center">
                        <p>25,428 (90%)</p>
                     </c>
                     <c ca="center">
                        <p>128 (17%)</p>
                     </c>
                     <c ca="center">
                        <p>56 (53%)</p>
                     </c>
                  </r>
               </tblbdy>
               <tblfn>
                  <p>Models were clustered in loci as described in Materials and methods.</p>
               </tblfn>
            </tbl>
         </sec>
         <sec>
            <st>
               <p>Identifying novel genes and improving gene annotation</p>
            </st>
            <p>Expectedly, since <it>V. vinifera </it>belongs to a phylogenetic branch where a profusion of resources are available, most of the models (95%) that fall outside of cDNAs overlap the reference annotation <abbrgrp><abbr bid="B44">44</abbr></abbrgrp>, or other resources such as GeneWise hits with Uniprot proteins <abbrgrp><abbr bid="B39">39</abbr><abbr bid="B49">49</abbr></abbrgrp>, and ESTs from other species (Table S2 in Additional data file 1). However, 675 models are completely novel, or 116 when considering only models with a plausible CDS. We compared the characteristics of the models that are novel and the models that are supported by evidence, which we now call 'known' models (Table <tblr tid="T5">5</tblr>).</p>
            <p>The proportion of models with a plausible CDS drops when considering the novel models compared to the known models (from 65% to 17%), as well as the average number of exons per model (from 8.2 to 2.3 for all models). It is likely that some of the novel models correspond to false predictions: if one junction is validated erroneously, it will create a false two-exon model. Nevertheless, the proportion of models with more than two exons is higher in the subset of novel models having plausible CDSs (53%) compared to all novel models (17%), which suggests that at least some of them are genuine novel coding loci. In addition, the novel loci that are non-coding could either correspond to coding transcripts that were mis-annotated by the pipeline (wrong splice site generating a frameshift, models associating incompatible exons), to coding transcripts where no CDS could be detected because of frameshifts in the genomic sequence, to genuine non-coding transcripts, or to transcriptional/experimental noise (Figure <figr fid="F1">1</figr>). The structure of one of the novel models, spanning eight exons, is shown in Figure S2 in Additional data file 1. A Blast <abbrgrp><abbr bid="B48">48</abbr></abbrgrp> search against Uniprot <abbrgrp><abbr bid="B49">49</abbr></abbrgrp> revealed homology to a transcription regulator from <it>Arabidopsis thaliana</it>. The homology was below the sensitivity threshold required to map proteins to the genome during the annotation process. In addition to the discovery of novel splice forms and novel loci, <it>G-Mo.R-Se </it>models enrich the reference annotation by extending (in 5' or 3') about 40% of the reference genes they hit. <it>G-Mo.R-Se </it>models thus constitute a valuable resource for improving <it>V. vinifera </it>gene annotation.</p>
         </sec>
      </sec>
      <sec>
         <st>
            <p>Conclusion</p>
         </st>
         <p>In this study, we demonstrate the feasibility of building gene models <it>de novo</it>, using only RNA-Seq reads and the corresponding genomic sequence, with a relatively straightforward annotation pipeline that we call <it>G-Mo.R-Se</it>. Using a dataset of approximately 175 million Solexa reads, it could detect more loci than could be identified by cloning and sequencing approximately 120,000 cDNAs, at a cost about 20 times lower (55% of the multi-exonic genes from the annotation are overlapped by models versus only 35% by <it>V. vinifera </it>cDNAs). In particular, <it>G-Mo.R-Se </it>allowed the annotation of loci expressed at very low levels. We show that this approach efficiently distinguishes real transcripts from transcriptional/experimental noise since the junction validation step removes false positive covtigs. Additionally, although it was not designed to be exhaustive in the detection of alternative splicing events, <it>G-Mo.R-Se </it>detected more alternative splice forms than the cDNA resource, with no need for <it>a priori </it>knowledge of the exon-exon junctions to test. Finally, we could also identify putative novel genes (that had been missed by the automatic annotation procedure) in a genome that is already very well annotated owing to the plethora of resources available in this phylum. We tested the <it>G-Mo.R-Se </it>pipeline with Solexa/Illumina RNA-Seq reads but it can readily accept any other type of short reads, or combine reads from different technologies.</p>
         <p>For future genome projects, it is conceivable to perform the annotation using RNA-Seq runs treated with <it>G-Mo.R-Se </it>as the unique resource, provided that the tissues or cell types sampled are representative enough to drive a comprehensive annotation. This approach will be particularly valuable in phyla where few resources are available (that is, that are very distant from the species currently present in the EST/protein databases), where the expensive and time-consuming step of constructing cDNA libraries could be avoided. When other resources are available, the gene models can also be combined with other data into automatic or manual eukaryotic genome annotation pipelines.</p>
         <p>Although the <it>G-Mo.R-Se </it>pipeline works satisfactorily on the <it>V. vinifera </it>dataset, it is still fairly simple and we can think of several refinements. First, at the moment, no mono-exonic models are produced (such models represent only 8% of annotated grape genes), but we could easily bring back the covtigs that were not linked to any other covtig by a validated junction, if they contain a CDS that exceeds a certain length. Next, at the covtig building step, instead of using a fixed depth threshold, we could adapt it to the environment: the covtigs would be built to coincide with sharp increases/decreases in depth. Such a strategy should enable the annotation of separate exons in case of IR. In order to correct even more fusions, it would also be straightforward to test candidate junctions inside the covtigs in addition to the junctions tested between covtigs. Since the scope of this study was to annotate as many genes as possible, we chose to pool together the reads from all four tissues before building the covtigs. But we could also consider building covtigs and gene models separately in different samples, in order to investigate differential expression, although to the detriment of sensitivity. A last, more elaborate refinement would be to use the depth information in order to link together only covtigs that are likely to be part of the same transcript, instead of building all models that correspond to the longest possible paths in the graph of covtigs linked by validated junctions. Such an approach would allow speculation on longer-range splice contiguity and a more exhaustive study of the alternative splicing landscape.</p>
      </sec>
      <sec>
         <st>
            <p>Materials and methods</p>
         </st>
         <sec>
            <st>
               <p>RNA-Seq experiments</p>
            </st>
            <p>RNA-Seq reads were obtained (as described in Del Fabbro <it>et al</it>., unpublished data) by sequencing cDNA obtained from four tissue samples with the Solexa/Illumina technology: leaf (11 lanes), root (9 lanes), callus (9 lanes), and stem (9 lanes). The mRNA molecules were purified from total RNA extractions and fragmented before cDNA synthesis (with random hexamer primers). The protocol was not strand-specific. The single-end reads obtained were 32 nucleotides long, except for 5 lanes in the callus sample, where the reads were 35 nucleotides long. The resulting 172,545,778 usable reads (5.4 Gbases) were mapped to the <it>V. vinifera </it>genome <abbrgrp><abbr bid="B44">44</abbr></abbrgrp> using SOAP <abbrgrp><abbr bid="B8">8</abbr></abbrgrp> with a seed length of 12 and default parameters: 138,326,238 reads (4.6 Gbases) were mapped at one unique position with at most two mismatches and no indels. As a consequence, reads that align to exon-exon junctions could not be mapped to the genomic sequence.</p>
         </sec>
         <sec>
            <st>
               <p>Building gene models from short reads</p>
            </st>
            <p>The <it>G-Mo.R-Se </it>method for building gene models from short reads is summarized in Figure <figr fid="F1">1</figr>. The first step is the definition of covtigs (coverage contigs). They are built by contiging the positions where short reads are aligned above a certain coverage depth threshold. This threshold is a parameter that needs to be adjusted in order to balance sensitivity and specificity as well as splits and fusions. In the absence of a training set to quantify the splits and fusions, this parameter can also be optimized by maximizing the number of junctions validated in the next step. Before the subsequent testing of junctions, the covtigs were extended using all 16-mers found in short reads, in order to step over mismatches and short repeats. It is important to note that the read length limits the detection of very short exons (&lt; 35 nucleotides).</p>
            <p>In the next step, we searched for donor (GT or GC on the forward strand, and AC or GC on the reverse strand) and acceptor (AG on forward strand and CT on reverse strand) splice sites 100 nucleotides inside and outside each covtig boundary. This enabled us to create a list of oriented candidate exons (with putative alternative donor and/or acceptor splice sites) for each covtig.</p>
            <p>The third step was the validation of junctions between candidate exons using unmapped reads, since reads that align to exon-exon junctions were not mapped to the genomic sequence. We tested all candidate exons derived from a given covtig with the candidate exons derived from the 20 next covtigs. All the putative junctions were tested using a word dictionary approach. The dictionary (with a word size of 25) was built using the unmapped reads. Ten words (8 nucleotides on the first exon and 17 nucleotides on the second exon, 9/16, 10/15, 11/14, 12/13, 13/12, 14/11, 15/10, 16/9, 17/8) were derived from each putative junction, and their presence in the dictionary was tested. In order to validate a junction, at least five different words need to be found in the dictionary, and the total number of occurrences of all words derived from each junction needs to be of the same order of magnitude as the average depth of the adjacent covtigs (greater than 1/10 of their average depth).</p>
            <p>The efficiency of the junction validation procedure relies on the covtig definition step for the following reasons: only the junctions between each covtig and the 20 next covtigs are tested, meaning that if more than 20 'false' covtigs are defined between 2 'real' covtigs, the junction between the two real covtigs will not be tested; only 100 nucleotides around the covtig boundary are scanned for putative splice sites, meaning that if the covtigs are too short or too long, the correct junction will not be tested; only junctions between covtigs are tested, meaning that if a covtig corresponds to a fusion between two exons, the correct junction will not be tested, and the final model will include a retained intron. On the other hand, if an exon is split between two covtigs, no junction will be valid between those covtigs, leading to the splitting of a gene into separate models. As a consequence, in the absence of a training set (annotated genes, ESTs, and so on) to calibrate the depth threshold used for building covtigs, it is possible to optimize the threshold by maximizing the number of validated junctions. <it>G-Mo.R-Se </it>can thus be used for <it>de novo </it>annotation.</p>
            <p>For the last step, the model construction relies on the graph of candidate exons linked by validated junctions on the same strand. The models correspond to all the longest paths linking candidate exons through validated junctions. Candidate exons that are not involved in any validated junction are discarded, implying that no mono-exonic models are produced. In order to correct potential gene splits, we fuse together adjacent models (on the same strand) that are linked by an open reading frame.</p>
            <p>Additionally, all models produced by <it>G-Mo.R-Se </it>are searched for CDSs. When the longest CDS (if greater than 50 amino acids) spans at least two-thirds of the nucleotides of a model or the number of non-coding exons is lower than the number of coding exons, the CDS is qualified as plausible. Models with plausible CDSs are likely to correspond to protein coding genes. Plausible CDSs could be detected for about two-thirds of the models. The <it>G-Mo.R-Se </it>models can be downloaded from the <it>G-Mo.R-Se </it>website <abbrgrp><abbr bid="B45">45</abbr></abbrgrp> and visualized on the <it>V. vinifera </it>genome browser <abbrgrp><abbr bid="B50">50</abbr></abbrgrp>.</p>
         </sec>
         <sec>
            <st>
               <p><it>G-Mo.R-Se </it>models and cDNA analysis (clustering, alternative splicing detection)</p>
            </st>
            <p>The same clustering procedure was applied to models and cDNA sequences aligned on the genome. We used a single linkage clustering approach, where a link between two models was created if they had a cumulated exonic overlap (on the same strand) of at least 100 nucleotides (only overlaps of at least 10 nucleotides were considered). A graph-based approach was used to resolve the single linkage clustering. Additionally, the redundancy was removed from the cDNAs by discarding all transcript structures that were fully included in longer structures. We detected all pairwise alternative splicing events between intron pairs, with the same method as described in <abbrgrp><abbr bid="B46">46</abbr></abbrgrp>. All tandemly duplicated genes were discarded from the alternative splicing events detected, since such genes may be artificially linked by cDNA mapping as well as model construction, and would generate false alternative splice forms spanning several loci instead of one. However, it is notable that, since the pipeline builds all possible models, it will always predict the two separate correct models in addition to the incorrect joined model(s).</p>
         </sec>
      </sec>
      <sec>
         <st>
            <p>Abbreviations</p>
         </st>
         <p>CDS: coding sequence; EST: expressed sequence tag; <it>G-Mo.R-Se</it>: Gene Modelling using RNA-Seq; IR: intron retention; SOAP: Short Oligonucleotide Analysis Package.</p>
      </sec>
      <sec>
         <st>
            <p>Authors' contributions</p>
         </st>
         <p>FD performed preliminary tests, ran the pipeline and analyzed the results. JMA had the original idea of the algorithm. CDS performed the mapping of the cDNAs and analyzed alternative splicing events. BN produced the annotation of the grapevine genome. OR developed components of the cDNA mapping pipeline. RNA-Seq data were generated and provided thanks to MD, MM and GV. PW and CS produced genomics and cDNA data and assisted in data management. OJ took care of the coordination with the <it>V. vinifera </it>consortium, and contributed to the writing of the paper. FA assisted in the design of the pipeline and manuscript preparation. FD and JMA developed the current version of the software and wrote the paper. All authors read and approved the final manuscript.</p>
      </sec>
      <sec>
         <st>
            <p>Additional data files</p>
         </st>
         <p>The following additional data are available with the online version of the paper. Additional data file <supplr sid="S1">1</supplr> is a Word file containing Tables S1 and S2 and Figures S1 and S2. Table S1: cDNA transcript structures correctly predicted by <it>G-Mo.R-Se </it>and Velvet. Table S2: support (in public resources) of <it>G-Mo.R-Se </it>models that do not overlap cDNAs. Figure S1: proportions of exon fusions and exon splits obtained with different depth thresholds for the covtig construction step. Figure S2: example of a novel model.</p>
         <suppl id="S1">
            <title>
               <p>Additional data file 1</p>
            </title>
            <caption>
               <p>Tables S1 and S2 and Figures S1 and S2</p>
            </caption>
            <text>
               <p>Table S1: cDNA transcript structures correctly predicted by <it>G-Mo.R-Se </it>and Velvet. Table S2: support (in public resources) of <it>G-Mo.R-Se </it>models that do not overlap cDNAs. Figure S1: proportions of exon fusions and exon splits obtained with different depth thresholds for the covtig construction step. Figure S2: example of a novel model.</p>
            </text>
            <file name="gb-2008-9-12-r175-S1.doc">
               <p>Click here for file</p>
            </file>
         </suppl>
      </sec>
   </bdy>
   <bm>
      <ack>
         <sec>
            <st>
               <p>Acknowledgements</p>
            </st>
            <p>This work was financially supported by the Genoscope, Institut de G&#233;nomique, CEA and Agence Nationale de la Recherche (ANR). The authors acknowledge Susan Cure for correcting the manuscript and Jean Weissenbach for continuous support.</p>
         </sec>
      </ack>
      <refgrp>
         <bibl id="B1">
            <title>
               <p>The new paradigm of flow cell sequencing.</p>
            </title>
            <aug>
               <au>
                  <snm>Holt</snm>
                  <fnm>RA</fnm>
               </au>
               <au>
                  <snm>Jones</snm>
                  <fnm>SJ</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>839</fpage>
            <lpage>846</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1101/gr.073262.107</pubid>
                  <pubid idtype="pmpid" link="fulltext">18519653</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B2">
            <title>
               <p>Next-generation DNA sequencing methods.</p>
            </title>
            <aug>
               <au>
                  <snm>Mardis</snm>
                  <fnm>ER</fnm>
               </au>
            </aug>
            <source>Annu Rev Genomics Hum Genet</source>
            <pubdate>2008</pubdate>
            <volume>9</volume>
            <fpage>387</fpage>
            <lpage>402</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1146/annurev.genom.9.081307.164359</pubid>
                  <pubid idtype="pmpid" link="fulltext">18576944</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B3">
            <title>
               <p>Systematic prediction and validation of breakpoints associated with copy-number variants in the human genome.</p>
            </title>
            <aug>
               <au>
                  <snm>Korbel</snm>
                  <fnm>JO</fnm>
               </au>
               <au>
                  <snm>Urban</snm>
                  <fnm>AE</fnm>
               </au>
               <au>
                  <snm>Grubert</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Du</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Royce</snm>
                  <fnm>TE</fnm>
               </au>
               <au>
                  <snm>Starr</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Zhong</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Emanuel</snm>
                  <fnm>BS</fnm>
               </au>
               <au>
                  <snm>Weissman</snm>
                  <fnm>SM</fnm>
               </au>
               <au>
                  <snm>Snyder</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Gerstein</snm>
                  <fnm>MB</fnm>
               </au>
            </aug>
            <source>Proc Natl Acad Sci USA</source>
            <pubdate>2007</pubdate>
            <volume>104</volume>
            <fpage>10110</fpage>
            <lpage>10115</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1891248</pubid>
                  <pubid idtype="pmpid" link="fulltext">17551006</pubid>
                  <pubid idtype="doi">10.1073/pnas.0703834104</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B4">
            <title>
               <p>Mapping translocation breakpoints by next-generation sequencing.</p>
            </title>
            <aug>
               <au>
                  <snm>Chen</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Kalscheuer</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Tzschach</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Menzel</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Ullmann</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Schulz</snm>
                  <fnm>MH</fnm>
               </au>
               <au>
                  <snm>Erdogan</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Li</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Kijas</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Arkesteijn</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Pajares</snm>
                  <fnm>IL</fnm>
               </au>
               <au>
                  <snm>Goetz-Sothmann</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Heinrich</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Rost</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Dufke</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Grasshoff</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Glaeser</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Vingron</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Ropers</snm>
                  <fnm>HH</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>1143</fpage>
            <lpage>1149</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2493403</pubid>
                  <pubid idtype="pmpid" link="fulltext">18326688</pubid>
                  <pubid idtype="doi">10.1101/gr.076166.108</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B5">
            <title>
               <p>Shotgun bisulphite sequencing of the <it>Arabidopsis </it>genome reveals DNA methylation patterning.</p>
            </title>
            <aug>
               <au>
                  <snm>Cokus</snm>
                  <fnm>SJ</fnm>
               </au>
               <au>
                  <snm>Feng</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>X</fnm>
               </au>
               <au>
                  <snm>Chen</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Merriman</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Haudenschild</snm>
                  <fnm>CD</fnm>
               </au>
               <au>
                  <snm>Pradhan</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Nelson</snm>
                  <fnm>SF</fnm>
               </au>
               <au>
                  <snm>Pellegrini</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Jacobsen</snm>
                  <fnm>SE</fnm>
               </au>
            </aug>
            <source>Nature</source>
            <pubdate>2008</pubdate>
            <volume>452</volume>
            <fpage>215</fpage>
            <lpage>219</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2377394</pubid>
                  <pubid idtype="pmpid" link="fulltext">18278030</pubid>
                  <pubid idtype="doi">10.1038/nature06745</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B6">
            <title>
               <p>Global analysis of in vivo Foxa2-binding sites in mouse adult liver using massively parallel sequencing.</p>
            </title>
            <aug>
               <au>
                  <snm>Wederell</snm>
                  <fnm>ED</fnm>
               </au>
               <au>
                  <snm>Bilenky</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Cullum</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Thiessen</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Dagpinar</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Delaney</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Varhol</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Zhao</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Zeng</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Bernier</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Ingham</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Hirst</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Robertson</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Marra</snm>
                  <fnm>MA</fnm>
               </au>
               <au>
                  <snm>Jones</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Hoodless</snm>
                  <fnm>PA</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2008</pubdate>
            <volume>36</volume>
            <fpage>4549</fpage>
            <lpage>4564</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2504304</pubid>
                  <pubid idtype="pmpid" link="fulltext">18611952</pubid>
                  <pubid idtype="doi">10.1093/nar/gkn382</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B7">
            <title>
               <p>Genome-wide maps of chromatin state in pluripotent and lineage-committed cells.</p>
            </title>
            <aug>
               <au>
                  <snm>Mikkelsen</snm>
                  <fnm>TS</fnm>
               </au>
               <au>
                  <snm>Ku</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Jaffe</snm>
                  <fnm>DB</fnm>
               </au>
               <au>
                  <snm>Issac</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Lieberman</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Giannoukos</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Alvarez</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Brockman</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Kim</snm>
                  <fnm>TK</fnm>
               </au>
               <au>
                  <snm>Koche</snm>
                  <fnm>RP</fnm>
               </au>
               <au>
                  <snm>Lee</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Mendenhall</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>O'Donovan</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Presser</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Russ</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Xie</snm>
                  <fnm>X</fnm>
               </au>
               <au>
                  <snm>Meissner</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Wernig</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Jaenisch</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Nusbaum</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Lander</snm>
                  <fnm>ES</fnm>
               </au>
               <au>
                  <snm>Bernstein</snm>
                  <fnm>BE</fnm>
               </au>
            </aug>
            <source>Nature</source>
            <pubdate>2007</pubdate>
            <volume>448</volume>
            <fpage>553</fpage>
            <lpage>560</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nature06008</pubid>
                  <pubid idtype="pmpid" link="fulltext">17603471</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B8">
            <title>
               <p>SOAP: short oligonucleotide alignment program.</p>
            </title>
            <aug>
               <au>
                  <snm>Li</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Li</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Kristiansen</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Wang</snm>
                  <fnm>J</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2008</pubdate>
            <volume>24</volume>
            <fpage>713</fpage>
            <lpage>714</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/btn025</pubid>
                  <pubid idtype="pmpid" link="fulltext">18227114</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B9">
            <title>
               <p>Whole-genome sequencing and variant discovery in <it>C. elegans</it>.</p>
            </title>
            <aug>
               <au>
                  <snm>Hillier</snm>
                  <fnm>LW</fnm>
               </au>
               <au>
                  <snm>Marth</snm>
                  <fnm>GT</fnm>
               </au>
               <au>
                  <snm>Quinlan</snm>
                  <fnm>AR</fnm>
               </au>
               <au>
                  <snm>Dooling</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Fewell</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Barnett</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Fox</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Glasscock</snm>
                  <fnm>JI</fnm>
               </au>
               <au>
                  <snm>Hickenbotham</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Huang</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Magrini</snm>
                  <fnm>VJ</fnm>
               </au>
               <au>
                  <snm>Richt</snm>
                  <fnm>RJ</fnm>
               </au>
               <au>
                  <snm>Sander</snm>
                  <fnm>SN</fnm>
               </au>
               <au>
                  <snm>Stewart</snm>
                  <fnm>DA</fnm>
               </au>
               <au>
                  <snm>Stromberg</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Tsung</snm>
                  <fnm>EF</fnm>
               </au>
               <au>
                  <snm>Wylie</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Schedl</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Wilson</snm>
                  <fnm>RK</fnm>
               </au>
               <au>
                  <snm>Mardis</snm>
                  <fnm>ER</fnm>
               </au>
            </aug>
            <source>Nat Methods</source>
            <pubdate>2008</pubdate>
            <volume>5</volume>
            <fpage>183</fpage>
            <lpage>188</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nmeth.1179</pubid>
                  <pubid idtype="pmpid" link="fulltext">18204455</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B10">
            <title>
               <p>ZOOM! Zillions Of Oligos Mapped.</p>
            </title>
            <aug>
               <au>
                  <snm>Lin</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>MQ</fnm>
               </au>
               <au>
                  <snm>Ma</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Li</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2008</pubdate>
            <volume>24</volume>
            <fpage>2431</fpage>
            <lpage>2437</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/btn416</pubid>
                  <pubid idtype="pmpid" link="fulltext">18684737</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B11">
            <title>
               <p>Mapping short DNA sequencing reads and calling variants using mapping quality scores.</p>
            </title>
            <aug>
               <au>
                  <snm>Li</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Ruan</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Durbin</snm>
                  <fnm>R</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>1851</fpage>
            <lpage>1858</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2577856</pubid>
                  <pubid idtype="pmpid" link="fulltext">18714091</pubid>
                  <pubid idtype="doi">10.1101/gr.078212.108</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B12">
            <title>
               <p>Assembling millions of short DNA sequences using SSAKE.</p>
            </title>
            <aug>
               <au>
                  <snm>Warren</snm>
                  <fnm>RL</fnm>
               </au>
               <au>
                  <snm>Sutton</snm>
                  <fnm>GG</fnm>
               </au>
               <au>
                  <snm>Jones</snm>
                  <fnm>SJ</fnm>
               </au>
               <au>
                  <snm>Holt</snm>
                  <fnm>RA</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2007</pubdate>
            <volume>23</volume>
            <fpage>500</fpage>
            <lpage>501</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/btl629</pubid>
                  <pubid idtype="pmpid" link="fulltext">17158514</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B13">
            <title>
               <p>ALLPATHS: <it>de novo </it>assembly of whole-genome shotgun microreads.</p>
            </title>
            <aug>
               <au>
                  <snm>Butler</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>MacCallum</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Kleber</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Shlyakhter</snm>
                  <fnm>IA</fnm>
               </au>
               <au>
                  <snm>Belmonte</snm>
                  <fnm>MK</fnm>
               </au>
               <au>
                  <snm>Lander</snm>
                  <fnm>ES</fnm>
               </au>
               <au>
                  <snm>Nusbaum</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Jaffe</snm>
                  <fnm>DB</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>810</fpage>
            <lpage>820</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2336810</pubid>
                  <pubid idtype="pmpid" link="fulltext">18340039</pubid>
                  <pubid idtype="doi">10.1101/gr.7337908</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B14">
            <title>
               <p>Velvet: algorithms for <it>de novo </it>short read assembly using de Bruijn graphs.</p>
            </title>
            <aug>
               <au>
                  <snm>Zerbino</snm>
                  <fnm>DR</fnm>
               </au>
               <au>
                  <snm>Birney</snm>
                  <fnm>E</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>821</fpage>
            <lpage>829</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2336801</pubid>
                  <pubid idtype="pmpid" link="fulltext">18349386</pubid>
                  <pubid idtype="doi">10.1101/gr.074492.107</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B15">
            <title>
               <p>SHARCGS, a fast and highly accurate short-read assembly algorithm for <it>de novo </it>genomic sequencing.</p>
            </title>
            <aug>
               <au>
                  <snm>Dohm</snm>
                  <fnm>JC</fnm>
               </au>
               <au>
                  <snm>Lottaz</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Borodina</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Himmelbauer</snm>
                  <fnm>H</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2007</pubdate>
            <volume>17</volume>
            <fpage>1697</fpage>
            <lpage>1706</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2045152</pubid>
                  <pubid idtype="pmpid" link="fulltext">17908823</pubid>
                  <pubid idtype="doi">10.1101/gr.6435207</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B16">
            <title>
               <p>Whole-genome sequencing and assembly with high-throughput, short-read technologies.</p>
            </title>
            <aug>
               <au>
                  <snm>Sundquist</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Ronaghi</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Tang</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Pevzner</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Batzoglou</snm>
                  <fnm>S</fnm>
               </au>
            </aug>
            <source>PLoS ONE</source>
            <pubdate>2007</pubdate>
            <volume>2</volume>
            <fpage>e484</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1871613</pubid>
                  <pubid idtype="pmpid" link="fulltext">17534434</pubid>
                  <pubid idtype="doi">10.1371/journal.pone.0000484</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B17">
            <title>
               <p>A microRNA catalog of the developing chicken embryo identified by a deep sequencing approach.</p>
            </title>
            <aug>
               <au>
                  <snm>Glazov</snm>
                  <fnm>EA</fnm>
               </au>
               <au>
                  <snm>Cottee</snm>
                  <fnm>PA</fnm>
               </au>
               <au>
                  <snm>Barris</snm>
                  <fnm>WC</fnm>
               </au>
               <au>
                  <snm>Moore</snm>
                  <fnm>RJ</fnm>
               </au>
               <au>
                  <snm>Dalrymple</snm>
                  <fnm>BP</fnm>
               </au>
               <au>
                  <snm>Tizard</snm>
                  <fnm>ML</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>957</fpage>
            <lpage>964</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2413163</pubid>
                  <pubid idtype="pmpid" link="fulltext">18469162</pubid>
                  <pubid idtype="doi">10.1101/gr.074740.107</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B18">
            <title>
               <p>Deep sequencing of tomato short RNAs identifies microRNAs targeting genes involved in fruit ripening.</p>
            </title>
            <aug>
               <au>
                  <snm>Moxon</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Jing</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Szittya</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Schwach</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Rusholme Pilcher</snm>
                  <fnm>RL</fnm>
               </au>
               <au>
                  <snm>Moulton</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Dalmay</snm>
                  <fnm>T</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>1602</fpage>
            <lpage>1609</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1101/gr.080127.108</pubid>
                  <pubid idtype="pmpid" link="fulltext">18653800</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B19">
            <title>
               <p>MicroRNA discovery and profiling in human embryonic stem cells by deep sequencing of small RNA libraries.</p>
            </title>
            <aug>
               <au>
                  <snm>Bar</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Wyman</snm>
                  <fnm>SK</fnm>
               </au>
               <au>
                  <snm>Fritz</snm>
                  <fnm>BR</fnm>
               </au>
               <au>
                  <snm>Qi</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Garg</snm>
                  <fnm>KS</fnm>
               </au>
               <au>
                  <snm>Parkin</snm>
                  <fnm>RK</fnm>
               </au>
               <au>
                  <snm>Kroh</snm>
                  <fnm>EM</fnm>
               </au>
               <au>
                  <snm>Bendoraite</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Mitchell</snm>
                  <fnm>PS</fnm>
               </au>
               <au>
                  <snm>Nelson</snm>
                  <fnm>AM</fnm>
               </au>
               <au>
                  <snm>Ruzzo</snm>
                  <fnm>WL</fnm>
               </au>
               <au>
                  <snm>Ware</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Radich</snm>
                  <fnm>JP</fnm>
               </au>
               <au>
                  <snm>Gentleman</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Ruohola-Baker</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Tewari</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Stem Cells</source>
            <pubdate>2008</pubdate>
            <volume>26</volume>
            <fpage>2496</fpage>
            <lpage>2505</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1634/stemcells.2008-0356</pubid>
                  <pubid idtype="pmpid" link="fulltext">18583537</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B20">
            <title>
               <p>Highly integrated single-base resolution maps of the epigenome in <it>Arabidopsis</it>.</p>
            </title>
            <aug>
               <au>
                  <snm>Lister</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>O'Malley</snm>
                  <fnm>RC</fnm>
               </au>
               <au>
                  <snm>Tonti-Filippini</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Gregory</snm>
                  <fnm>BD</fnm>
               </au>
               <au>
                  <snm>Berry</snm>
                  <fnm>CC</fnm>
               </au>
               <au>
                  <snm>Millar</snm>
                  <fnm>AH</fnm>
               </au>
               <au>
                  <snm>Ecker</snm>
                  <fnm>JR</fnm>
               </au>
            </aug>
            <source>Cell</source>
            <pubdate>2008</pubdate>
            <volume>133</volume>
            <fpage>523</fpage>
            <lpage>536</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1016/j.cell.2008.03.029</pubid>
                  <pubid idtype="pmpid" link="fulltext">18423832</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B21">
            <title>
               <p>Dynamic repertoire of a eukaryotic transcriptome surveyed at single-nucleotide resolution.</p>
            </title>
            <aug>
               <au>
                  <snm>Wilhelm</snm>
                  <fnm>BT</fnm>
               </au>
               <au>
                  <snm>Marguerat</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Watt</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Schubert</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Wood</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Goodhead</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Penkett</snm>
                  <fnm>CJ</fnm>
               </au>
               <au>
                  <snm>Rogers</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Bahler</snm>
                  <fnm>J</fnm>
               </au>
            </aug>
            <source>Nature</source>
            <pubdate>2008</pubdate>
            <volume>453</volume>
            <fpage>1239</fpage>
            <lpage>1243</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nature07002</pubid>
                  <pubid idtype="pmpid" link="fulltext">18488015</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B22">
            <title>
               <p>The transcriptional landscape of the yeast genome defined by RNA sequencing.</p>
            </title>
            <aug>
               <au>
                  <snm>Nagalakshmi</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Wang</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Waern</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Shou</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Raha</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Gerstein</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Snyder</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2008</pubdate>
            <volume>320</volume>
            <fpage>1344</fpage>
            <lpage>1349</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.1158441</pubid>
                  <pubid idtype="pmpid" link="fulltext">18451266</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B23">
            <title>
               <p>Mapping and quantifying mammalian transcriptomes by RNA-Seq.</p>
            </title>
            <aug>
               <au>
                  <snm>Mortazavi</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Williams</snm>
                  <fnm>BA</fnm>
               </au>
               <au>
                  <snm>McCue</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Schaeffer</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Wold</snm>
                  <fnm>B</fnm>
               </au>
            </aug>
            <source>Nat Methods</source>
            <pubdate>2008</pubdate>
            <volume>5</volume>
            <fpage>621</fpage>
            <lpage>628</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nmeth.1226</pubid>
                  <pubid idtype="pmpid" link="fulltext">18516045</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B24">
            <title>
               <p>Stem cell transcriptome profiling via massive-scale mRNA sequencing.</p>
            </title>
            <aug>
               <au>
                  <snm>Cloonan</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Forrest</snm>
                  <fnm>AR</fnm>
               </au>
               <au>
                  <snm>Kolle</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Gardiner</snm>
                  <fnm>BB</fnm>
               </au>
               <au>
                  <snm>Faulkner</snm>
                  <fnm>GJ</fnm>
               </au>
               <au>
                  <snm>Brown</snm>
                  <fnm>MK</fnm>
               </au>
               <au>
                  <snm>Taylor</snm>
                  <fnm>DF</fnm>
               </au>
               <au>
                  <snm>Steptoe</snm>
                  <fnm>AL</fnm>
               </au>
               <au>
                  <snm>Wani</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Bethel</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Robertson</snm>
                  <fnm>AJ</fnm>
               </au>
               <au>
                  <snm>Perkins</snm>
                  <fnm>AC</fnm>
               </au>
               <au>
                  <snm>Bruce</snm>
                  <fnm>SJ</fnm>
               </au>
               <au>
                  <snm>Lee</snm>
                  <fnm>CC</fnm>
               </au>
               <au>
                  <snm>Ranade</snm>
                  <fnm>SS</fnm>
               </au>
               <au>
                  <snm>Peckham</snm>
                  <fnm>HE</fnm>
               </au>
               <au>
                  <snm>Manning</snm>
                  <fnm>JM</fnm>
               </au>
               <au>
                  <snm>McKernan</snm>
                  <fnm>KJ</fnm>
               </au>
               <au>
                  <snm>Grimmond</snm>
                  <fnm>SM</fnm>
               </au>
            </aug>
            <source>Nat Methods</source>
            <pubdate>2008</pubdate>
            <volume>5</volume>
            <fpage>613</fpage>
            <lpage>619</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nmeth.1223</pubid>
                  <pubid idtype="pmpid" link="fulltext">18516046</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B25">
            <title>
               <p>RNA-Seq: An assessment of technical reproducibility and comparison with gene expression arrays.</p>
            </title>
            <aug>
               <au>
                  <snm>Marioni</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Mason</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Mane</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Stephens</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Gilad</snm>
                  <fnm>Y</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>1509</fpage>
            <lpage>1517</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2527709</pubid>
                  <pubid idtype="pmpid" link="fulltext">18550803</pubid>
                  <pubid idtype="doi">10.1101/gr.079558.108</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B26">
            <title>
               <p>A global view of gene activity and alternative splicing by deep sequencing of the human transcriptome.</p>
            </title>
            <aug>
               <au>
                  <snm>Sultan</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Schulz</snm>
                  <fnm>MH</fnm>
               </au>
               <au>
                  <snm>Richard</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Magen</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Klingenhoff</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Scherf</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Seifert</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Borodina</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Soldatov</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Parkhomchuk</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Schmidt</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>O'Keeffe</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Haas</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Vingron</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Lehrach</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Yaspo</snm>
                  <fnm>ML</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2008</pubdate>
            <volume>321</volume>
            <fpage>956</fpage>
            <lpage>960</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.1160342</pubid>
                  <pubid idtype="pmpid" link="fulltext">18599741</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B27">
            <title>
               <p>Tag-based approaches for transcriptome research and genome annotation.</p>
            </title>
            <aug>
               <au>
                  <snm>Harbers</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Carninci</snm>
                  <fnm>P</fnm>
               </au>
            </aug>
            <source>Nat Methods</source>
            <pubdate>2005</pubdate>
            <volume>2</volume>
            <fpage>495</fpage>
            <lpage>502</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nmeth768</pubid>
                  <pubid idtype="pmpid" link="fulltext">15973418</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B28">
            <title>
               <p>Serial analysis of gene expression.</p>
            </title>
            <aug>
               <au>
                  <snm>Velculescu</snm>
                  <fnm>VE</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Vogelstein</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Kinzler</snm>
                  <fnm>KW</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>1995</pubdate>
            <volume>270</volume>
            <fpage>484</fpage>
            <lpage>487</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.270.5235.484</pubid>
                  <pubid idtype="pmpid" link="fulltext">7570003</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B29">
            <title>
               <p>Gene expression analysis by massively parallel signature sequencing (MPSS) on microbead arrays.</p>
            </title>
            <aug>
               <au>
                  <snm>Brenner</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Johnson</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Bridgham</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Golda</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Lloyd</snm>
                  <fnm>DH</fnm>
               </au>
               <au>
                  <snm>Johnson</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Luo</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>McCurdy</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Foy</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Ewan</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Roth</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>George</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Eletr</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Albrecht</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Vermaas</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Williams</snm>
                  <fnm>SR</fnm>
               </au>
               <au>
                  <snm>Moon</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Burcham</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Pallas</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>DuBridge</snm>
                  <fnm>RB</fnm>
               </au>
               <au>
                  <snm>Kirchner</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Fearon</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Mao</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Corcoran</snm>
                  <fnm>K</fnm>
               </au>
            </aug>
            <source>Nat Biotechnol</source>
            <pubdate>2000</pubdate>
            <volume>18</volume>
            <fpage>630</fpage>
            <lpage>634</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/76469</pubid>
                  <pubid idtype="pmpid" link="fulltext">10835600</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B30">
            <title>
               <p>Transcriptional maps of 10 human chromosomes at 5-nucleotide resolution.</p>
            </title>
            <aug>
               <au>
                  <snm>Cheng</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Kapranov</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Drenkow</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Dike</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Brubaker</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Patel</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Long</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Stern</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Tammana</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Helt</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Sementchenko</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Piccolboni</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Bekiranov</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Bailey</snm>
                  <fnm>DK</fnm>
               </au>
               <au>
                  <snm>Ganesh</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Ghosh</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Bell</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Gerhard</snm>
                  <fnm>DS</fnm>
               </au>
               <au>
                  <snm>Gingeras</snm>
                  <fnm>TR</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2005</pubdate>
            <volume>308</volume>
            <fpage>1149</fpage>
            <lpage>1154</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.1108625</pubid>
                  <pubid idtype="pmpid" link="fulltext">15790807</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B31">
            <title>
               <p>Assessing the performance of different high-density tiling microarray strategies for mapping transcribed regions of the human genome.</p>
            </title>
            <aug>
               <au>
                  <snm>Emanuelsson</snm>
                  <fnm>O</fnm>
               </au>
               <au>
                  <snm>Nagalakshmi</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Zheng</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Rozowsky</snm>
                  <fnm>JS</fnm>
               </au>
               <au>
                  <snm>Urban</snm>
                  <fnm>AE</fnm>
               </au>
               <au>
                  <snm>Du</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Lian</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Stolc</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Weissman</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Snyder</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Gerstein</snm>
                  <fnm>MB</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2007</pubdate>
            <volume>17</volume>
            <fpage>886</fpage>
            <lpage>897</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1891347</pubid>
                  <pubid idtype="pmpid" link="fulltext">17119069</pubid>
                  <pubid idtype="doi">10.1101/gr.5014606</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B32">
            <title>
               <p>Genome-wide transcription and the implications for genomic organization.</p>
            </title>
            <aug>
               <au>
                  <snm>Kapranov</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Willingham</snm>
                  <fnm>AT</fnm>
               </au>
               <au>
                  <snm>Gingeras</snm>
                  <fnm>TR</fnm>
               </au>
            </aug>
            <source>Nat Rev Genet</source>
            <pubdate>2007</pubdate>
            <volume>8</volume>
            <fpage>413</fpage>
            <lpage>423</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nrg2083</pubid>
                  <pubid idtype="pmpid" link="fulltext">17486121</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B33">
            <title>
               <p>Gene discovery in dbEST.</p>
            </title>
            <aug>
               <au>
                  <snm>Boguski</snm>
                  <fnm>MS</fnm>
               </au>
               <au>
                  <snm>Tolstoshev</snm>
                  <fnm>CM</fnm>
               </au>
               <au>
                  <snm>Bassett</snm>
                  <fnm>DE</fnm>
                  <suf>Jr</suf>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>1994</pubdate>
            <volume>265</volume>
            <fpage>1993</fpage>
            <lpage>1994</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.8091218</pubid>
                  <pubid idtype="pmpid" link="fulltext">8091218</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B34">
            <title>
               <p>The status, quality, and expansion of the NIH full-length cDNA project: the Mammalian Gene Collection (MGC).</p>
            </title>
            <aug>
               <au>
                  <snm>Gerhard</snm>
                  <fnm>DS</fnm>
               </au>
               <au>
                  <snm>Wagner</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Feingold</snm>
                  <fnm>EA</fnm>
               </au>
               <au>
                  <snm>Shenmen</snm>
                  <fnm>CM</fnm>
               </au>
               <au>
                  <snm>Grouse</snm>
                  <fnm>LH</fnm>
               </au>
               <au>
                  <snm>Schuler</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Klein</snm>
                  <fnm>SL</fnm>
               </au>
               <au>
                  <snm>Old</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Rasooly</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Good</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Guyer</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Peck</snm>
                  <fnm>AM</fnm>
               </au>
               <au>
                  <snm>Derge</snm>
                  <fnm>JG</fnm>
               </au>
               <au>
                  <snm>Lipman</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Collins</snm>
                  <fnm>FS</fnm>
               </au>
               <au>
                  <snm>Jang</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Sherry</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Feolo</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Misquitta</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Lee</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Rotmistrovsky</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Greenhut</snm>
                  <fnm>SF</fnm>
               </au>
               <au>
                  <snm>Schaefer</snm>
                  <fnm>CF</fnm>
               </au>
               <au>
                  <snm>Buetow</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Bonner</snm>
                  <fnm>TI</fnm>
               </au>
               <au>
                  <snm>Haussler</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Kent</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Kiekhaus</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Furey</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Brent</snm>
                  <fnm>M</fnm>
               </au>
               <etal/>
            </aug>
            <source>Genome Res</source>
            <pubdate>2004</pubdate>
            <volume>14</volume>
            <fpage>2121</fpage>
            <lpage>2127</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">528928</pubid>
                  <pubid idtype="pmpid" link="fulltext">15489334</pubid>
                  <pubid idtype="doi">10.1101/gr.2596504</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B35">
            <title>
               <p>Whole genome sequence comparisons and "full-length" cDNA sequences: a combined approach to evaluate and improve <it>Arabidopsis </it>genome annotation.</p>
            </title>
            <aug>
               <au>
                  <snm>Castelli</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Aury</snm>
                  <fnm>JM</fnm>
               </au>
               <au>
                  <snm>Jaillon</snm>
                  <fnm>O</fnm>
               </au>
               <au>
                  <snm>Wincker</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Clepet</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Menard</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Cruaud</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Quetier</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Scarpelli</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Schachter</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Temple</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Caboche</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Weissenbach</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Salanoubat</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2004</pubdate>
            <volume>14</volume>
            <fpage>406</fpage>
            <lpage>413</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">353228</pubid>
                  <pubid idtype="pmpid" link="fulltext">14993207</pubid>
                  <pubid idtype="doi">10.1101/gr.1515604</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B36">
            <title>
               <p>EGASP: the human ENCODE Genome Annotation Assessment Project.</p>
            </title>
            <aug>
               <au>
                  <snm>Guigo</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Flicek</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Abril</snm>
                  <fnm>JF</fnm>
               </au>
               <au>
                  <snm>Reymond</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Lagarde</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Denoeud</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Antonarakis</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Ashburner</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Bajic</snm>
                  <fnm>VB</fnm>
               </au>
               <au>
                  <snm>Birney</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Castelo</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Eyras</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Ucla</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Gingeras</snm>
                  <fnm>TR</fnm>
               </au>
               <au>
                  <snm>Harrow</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Hubbard</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Lewis</snm>
                  <fnm>SE</fnm>
               </au>
               <au>
                  <snm>Reese</snm>
                  <fnm>MG</fnm>
               </au>
            </aug>
            <source>Genome Biol</source>
            <pubdate>2006</pubdate>
            <volume>7 Suppl 1</volume>
            <fpage>S2.1</fpage>
            <lpage>S2.31</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmpid" link="fulltext">16925836</pubid>
                  <pubid idtype="doi">10.1186/gb-2006-7-s1-s2</pubid>
                  <pubid idtype="pmcid">1810551</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B37">
            <title>
               <p>EST_GENOME: a program to align spliced DNA sequences to unspliced genomic DNA.</p>
            </title>
            <aug>
               <au>
                  <snm>Mott</snm>
                  <fnm>R</fnm>
               </au>
            </aug>
            <source>Comput Appl Biosci</source>
            <pubdate>1997</pubdate>
            <volume>13</volume>
            <fpage>477</fpage>
            <lpage>478</lpage>
            <xrefbib>
               <pubid idtype="pmpid">9283765</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B38">
            <title>
               <p>BLAT--the BLAST-like alignment tool.</p>
            </title>
            <aug>
               <au>
                  <snm>Kent</snm>
                  <fnm>WJ</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2002</pubdate>
            <volume>12</volume>
            <fpage>656</fpage>
            <lpage>664</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">187518</pubid>
                  <pubid idtype="pmpid" link="fulltext">11932250</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B39">
            <title>
               <p>GeneWise and Genomewise.</p>
            </title>
            <aug>
               <au>
                  <snm>Birney</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Clamp</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Durbin</snm>
                  <fnm>R</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2004</pubdate>
            <volume>14</volume>
            <fpage>988</fpage>
            <lpage>995</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">479130</pubid>
                  <pubid idtype="pmpid" link="fulltext">15123596</pubid>
                  <pubid idtype="doi">10.1101/gr.1865504</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B40">
            <title>
               <p>Automated generation of heuristics for biological sequence comparison.</p>
            </title>
            <aug>
               <au>
                  <snm>Slater</snm>
                  <fnm>GS</fnm>
               </au>
               <au>
                  <snm>Birney</snm>
                  <fnm>E</fnm>
               </au>
            </aug>
            <source>BMC Bioinformatics</source>
            <pubdate>2005</pubdate>
            <volume>6</volume>
            <fpage>31</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">553969</pubid>
                  <pubid idtype="pmpid" link="fulltext">15713233</pubid>
                  <pubid idtype="doi">10.1186/1471-2105-6-31</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B41">
            <title>
               <p>A computer program for aligning a cDNA sequence with a genomic DNA sequence.</p>
            </title>
            <aug>
               <au>
                  <snm>Florea</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Hartzell</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Rubin</snm>
                  <fnm>GM</fnm>
               </au>
               <au>
                  <snm>Miller</snm>
                  <fnm>W</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>1998</pubdate>
            <volume>8</volume>
            <fpage>967</fpage>
            <lpage>974</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">310774</pubid>
                  <pubid idtype="pmpid" link="fulltext">9750195</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B42">
            <title>
               <p>Improving the <it>Arabidopsis </it>genome annotation using maximal transcript alignment assemblies.</p>
            </title>
            <aug>
               <au>
                  <snm>Haas</snm>
                  <fnm>BJ</fnm>
               </au>
               <au>
                  <snm>Delcher</snm>
                  <fnm>AL</fnm>
               </au>
               <au>
                  <snm>Mount</snm>
                  <fnm>SM</fnm>
               </au>
               <au>
                  <snm>Wortman</snm>
                  <fnm>JR</fnm>
               </au>
               <au>
                  <snm>Smith</snm>
                  <fnm>RK</fnm>
                  <suf>Jr</suf>
               </au>
               <au>
                  <snm>Hannick</snm>
                  <fnm>LI</fnm>
               </au>
               <au>
                  <snm>Maiti</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Ronning</snm>
                  <fnm>CM</fnm>
               </au>
               <au>
                  <snm>Rusch</snm>
                  <fnm>DB</fnm>
               </au>
               <au>
                  <snm>Town</snm>
                  <fnm>CD</fnm>
               </au>
               <au>
                  <snm>Salzberg</snm>
                  <fnm>SL</fnm>
               </au>
               <au>
                  <snm>White</snm>
                  <fnm>O</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2003</pubdate>
            <volume>31</volume>
            <fpage>5654</fpage>
            <lpage>5666</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">206470</pubid>
                  <pubid idtype="pmpid" link="fulltext">14500829</pubid>
                  <pubid idtype="doi">10.1093/nar/gkg770</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B43">
            <title>
               <p>Optimal spliced alignments of short sequence reads.</p>
            </title>
            <aug>
               <au>
                  <snm>De Bona</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Ossowski</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Schneeberger</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Ratsch</snm>
                  <fnm>G</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2008</pubdate>
            <volume>24</volume>
            <fpage>i174</fpage>
            <lpage>i180</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/btn300</pubid>
                  <pubid idtype="pmpid" link="fulltext">18689821</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B44">
            <title>
               <p>The grapevine genome sequence suggests ancestral hexaploidization in major angiosperm phyla.</p>
            </title>
            <aug>
               <au>
                  <snm>Jaillon</snm>
                  <fnm>O</fnm>
               </au>
               <au>
                  <snm>Aury</snm>
                  <fnm>JM</fnm>
               </au>
               <au>
                  <snm>Noel</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Policriti</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Clepet</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Casagrande</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Choisne</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Aubourg</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Vitulo</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Jubin</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Vezzi</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Legeai</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Hugueney</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Dasilva</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Horner</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Mica</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Jublot</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Poulain</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Bruyere</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Billault</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Segurens</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Gouyvenoux</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Ugarte</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Cattonaro</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Anthouard</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Vico</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Del Fabbro</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Alaux</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Di Gaspero</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Dumas</snm>
                  <fnm>V</fnm>
               </au>
               <etal/>
            </aug>
            <source>Nature</source>
            <pubdate>2007</pubdate>
            <volume>449</volume>
            <fpage>463</fpage>
            <lpage>467</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nature06148</pubid>
                  <pubid idtype="pmpid" link="fulltext">17721507</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B45">
            <title>
               <p>G-Mo.R-Se Website</p>
            </title>
            <url>http://www.genoscope.cns.fr/gmorse</url>
         </bibl>
         <bibl id="B46">
            <title>
               <p>ASEtrap: a biological method for speeding up the exploration of spliceomes.</p>
            </title>
            <aug>
               <au>
                  <snm>Thill</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Castelli</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Pallud</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Salanoubat</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Wincker</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>de la Grange</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Auboeuf</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Schachter</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Weissenbach</snm>
                  <fnm>J</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2006</pubdate>
            <volume>16</volume>
            <fpage>776</fpage>
            <lpage>786</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1479860</pubid>
                  <pubid idtype="pmpid" link="fulltext">16682744</pubid>
                  <pubid idtype="doi">10.1101/gr.5063306</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B47">
            <title>
               <p>Genome-wide analysis of alternative pre-mRNA splicing in <it>Arabidopsis thaliana </it>based on full-length cDNA sequences.</p>
            </title>
            <aug>
               <au>
                  <snm>Iida</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Seki</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Sakurai</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Satou</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Akiyama</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Toyoda</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Konagaya</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Shinozaki</snm>
                  <fnm>K</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2004</pubdate>
            <volume>32</volume>
            <fpage>5096</fpage>
            <lpage>5103</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">521658</pubid>
                  <pubid idtype="pmpid" link="fulltext">15452276</pubid>
                  <pubid idtype="doi">10.1093/nar/gkh845</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B48">
            <title>
               <p>Gapped BLAST and PSI-BLAST: a new generation of protein database search programs.</p>
            </title>
            <aug>
               <au>
                  <snm>Altschul</snm>
                  <fnm>SF</fnm>
               </au>
               <au>
                  <snm>Madden</snm>
                  <fnm>TL</fnm>
               </au>
               <au>
                  <snm>Schaffer</snm>
                  <fnm>AA</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Miller</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Lipman</snm>
                  <fnm>DJ</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>1997</pubdate>
            <volume>25</volume>
            <fpage>3389</fpage>
            <lpage>3402</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">146917</pubid>
                  <pubid idtype="pmpid" link="fulltext">9254694</pubid>
                  <pubid idtype="doi">10.1093/nar/25.17.3389</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B49">
            <title>
               <p>The universal protein resource (UniProt).</p>
            </title>
            <source>Nucleic Acids Res</source>
            <pubdate>2008</pubdate>
            <volume>36</volume>
            <fpage>D190</fpage>
            <lpage>D195</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2238893</pubid>
                  <pubid idtype="pmpid" link="fulltext">18045787</pubid>
                  <pubid idtype="doi">10.1093/nar/gkn141</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B50">
            <title>
               <p>Grape Genome Browser</p>
            </title>
            <url>http://www.genoscope.cns.fr/vitis</url>
         </bibl>
         <bibl id="B51">
            <title>
               <p>GeneID in <it>Drosophila</it>.</p>
            </title>
            <aug>
               <au>
                  <snm>Parra</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Blanco</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Guigo</snm>
                  <fnm>R</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2000</pubdate>
            <volume>10</volume>
            <fpage>511</fpage>
            <lpage>515</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">310871</pubid>
                  <pubid idtype="pmpid" link="fulltext">10779490</pubid>
                  <pubid idtype="doi">10.1101/gr.10.4.511</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B52">
            <title>
               <p>Gene finding in novel genomes.</p>
            </title>
            <aug>
               <au>
                  <snm>Korf</snm>
                  <fnm>I</fnm>
               </au>
            </aug>
            <source>BMC Bioinformatics</source>
            <pubdate>2004</pubdate>
            <volume>5</volume>
            <fpage>59</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">421630</pubid>
                  <pubid idtype="pmpid" link="fulltext">15144565</pubid>
                  <pubid idtype="doi">10.1186/1471-2105-5-59</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
      </refgrp>
   </bm>
</art>

