BioJava:CookbookFrench:Distribution:Composition

Comment calculer la composition d’une ou plusieurs Sequences?

Le programme suivant est une démonstration complète capable de calucler la composition d’une ou plusieurs SymbolLists ou RichSequence. Cette application peut compter les mots de n’importe quelle taille et peut le faire de manière à trouver le mots qui se recoupent ou non (triplets ou codons par exemple).

Le programme utilise la librairie CLI pour le traitement des options de la ligne de commande et utilise les types génériques pour la sécurité des types. Il fait aussi la démonstration de l’usage de l’architecture I/O BioJavax en incluant la particularisation capable d’ignorer certaines informations commes les caractéristiques et les commentaires, sans importance pour la calcul de la composition.

```java /*

* Composition.java
*
* Created on October 10, 2005, 2:30 PM
*/

import java.io.BufferedReader; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; import java.util.Set; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; import org.biojava.bio.BioError; import org.biojava.bio.BioException; import org.biojava.bio.dist.Distribution; import org.biojava.bio.dist.DistributionFactory; import org.biojava.bio.dist.DistributionTools; import org.biojava.bio.dist.DistributionTrainerContext; import org.biojava.bio.dist.SimpleDistributionTrainerContext; import org.biojava.bio.seq.Sequence; import org.biojava.bio.seq.SequenceIterator; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.Alphabet; import org.biojava.bio.symbol.AlphabetManager; import org.biojava.bio.symbol.AtomicSymbol; import org.biojava.bio.symbol.FiniteAlphabet; import org.biojava.bio.symbol.IllegalAlphabetException; import org.biojava.bio.symbol.IllegalSymbolException; import org.biojava.bio.symbol.Symbol; import org.biojava.bio.symbol.SymbolList; import org.biojava.bio.symbol.SymbolListViews; import org.biojava.utils.ChangeVetoException; import org.biojavax.RichObjectFactory; import org.biojavax.bio.seq.RichSequenceIterator; import org.biojavax.bio.seq.io.EMBLFormat; import org.biojavax.bio.seq.io.FastaFormat; import org.biojavax.bio.seq.io.GenbankFormat; import org.biojavax.bio.seq.io.INSDseqFormat; import org.biojavax.bio.seq.io.RichSequenceBuilderFactory; import org.biojavax.bio.seq.io.RichSequenceFormat; import org.biojavax.bio.seq.io.RichStreamReader; import org.biojavax.bio.seq.io.UniProtFormat;

/**

* Determine the compostion of a group of sequences.
* @author Mark Schreiber
*/

public class Composition {

   private Alphabet alpha;
   private SequenceIterator iter;
   
   /** Creates a new instance of Composition */
   public Composition() {
   }
      
   /**
    * Determine the composition of a single SymbolList.
    * @param sl The SymbolList to determine the composition of.
    * @throws org.biojava.bio.symbol.IllegalAlphabetException if a 
    * Distribution cannot be made for this Alphabet
    * @throws org.biojava.bio.symbol.IllegalSymbolException if a Symbol from another 
    * Alphabet is presented to the
    * DistributionTrainer.
    * @throws org.biojava.bio.BioException unlikely to occur unless calculating windowed
    * composition on a sequence not evenly divisible
    * by the window length. Can also occur if a
    * SymbolList or RichSequence
    * is unavailable from an iterator or if
    * a Distribution somehow becomes
    * locked during training.
    * @return a Distribution representing the
    * calculated composition.
    */
   public Distribution compostion(SymbolList sl) 
           throws IllegalAlphabetException, IllegalSymbolException, BioException{
       Set` set = Collections.singleton(sl);` `       return averageCompostion(set.iterator(), 1, false);` `   }` `   ` `   /**` `    * Determine the composition of higer order words from` `    * a single ``SymbolList``. Optionally windowed` `    * (non-overlapping) or overlapping words can be used. Codons` `    * would be an example of 3rd order windowed words.` `    * @param sl The ``SymbolList`` to determine the composition of.` `    * @param order the order of words to count (eg for triplets use 3)` `    * @param windowed true to count non-overlapping words (eg codons).` `    * @throws org.biojava.bio.symbol.IllegalAlphabetException if a ``Distribution`` ` `    * cannot be made for this ``Alphabet` `    * @throws org.biojava.bio.symbol.IllegalSymbolException if a ``Symbol`` from another ` `    * ``Alphabet`` is presented to the` `    * ``DistributionTrainer``.` `    * @throws org.biojava.bio.BioException unlikely to occur unless calculating windowed` `    * composition on a sequence not evenly divisible` `    * by the window length. Can also occur if a` `    * ``SymbolList`` or ``RichSequence` `    * is unavailable from an iterator or if` `    * a ``Distribution`` somehow becomes` `    * locked during training.` `    * @return a ``Distribution`` representing the` `    * calculated composition.` `    */` `   public Distribution compostion(SymbolList sl, int order, boolean windowed) ` `           throws IllegalAlphabetException, IllegalSymbolException, BioException{` `       Set`` set = Collections.singleton(sl);` `       return averageCompostion(set.iterator(), order, windowed);` `   }` `   ` `   /**` `    * Determine the average composition of a collection of` `    * ``SymbolList``s.` `    * @param iter an iterator over ``SymbolList``s.` `    * @throws org.biojava.bio.symbol.IllegalAlphabetException if a ` `    * ``Distribution`` cannot be made for this ``Alphabet` `    * @throws org.biojava.bio.symbol.IllegalSymbolException if a ``Symbol`` from another ` `    * ``Alphabet`` is presented to the` `    * ``DistributionTrainer``.` `    * @throws org.biojava.bio.BioException unlikely to occur unless calculating windowed` `    * composition on a sequence not evenly divisible` `    * by the window length. Can also occur if a` `    * ``SymbolList`` or ``RichSequence` `    * is unavailable from an iterator or if` `    * a ``Distribution`` somehow becomes` `    * locked during training.` `    * @return a ``Distribution`` representing the` `    * calculated composition.` `    */` `   public Distribution averageCompostion(Iterator`<? extends SymbolList>` iter) ` `      throws IllegalAlphabetException, IllegalSymbolException, BioException` `   {` `       return this.averageCompostion(iter, 1, false);` `   }` `   ` `   /**` `    * Determine the average composition of higer order words from` `    * a collection of ``SymbolList``s. Optionally windowed` `    * (non-overlapping) or overlapping words can be used. Codons` `    * would be an example of 3rd order windowed words.` `    * @param iter an iterator over ``SymbolList``s.` `    * @param order the order of words to count (eg for triplets use 3)` `    * @param windowed true to count non-overlapping words (eg codons).` `    * @throws org.biojava.bio.symbol.IllegalAlphabetException if a ``Distribution` `    * cannot be made for this ``Alphabet` `    * @throws org.biojava.bio.symbol.IllegalSymbolException if a ``Symbol`` from another ` `    * ``Alphabet`` is presented to the` `    * ``DistributionTrainer``.` `    * @throws org.biojava.bio.BioException unlikely to occur unless calculating windowed` `    * composition on a sequence not evenly divisible` `    * by the window length. Can also occur if a` `    * ``SymbolList`` or ``RichSequence` `    * is unavailable from an iterator or if` `    * a ``Distribution`` somehow becomes` `    * locked during training.` `    * @return a ``Distribution`` representing the` `    * calculated composition.` `    */` `   public Distribution averageCompostion(Iterator`<? extends SymbolList>` iter, int order, boolean windowed)` `               throws IllegalAlphabetException, IllegalSymbolException, BioException{` `       ` `       DistributionTrainerContext dtc = new SimpleDistributionTrainerContext();` `       Distribution d = null;` `       ` `       if(order > 1){` `           iter = this.nmerView(iter, order, windowed);` `       }` `                     ` `       while(iter.hasNext()){` `           SymbolList sl = iter.next();` `           d = DistributionFactory.DEFAULT.createDistribution(sl.getAlphabet());` `           dtc.registerDistribution(d);    ` `           for(Iterator i = sl.iterator(); i.hasNext();){` `               dtc.addCount(d, (Symbol)i.next(), 1.0);` `           }` `       }` `       try{` `           dtc.train();` `       }catch(ChangeVetoException ex){` `           throw new Error("Cannot train distribution", ex); //impossible` `       }` `       return d;` `   }` `           ` `   /**` `    * Determine the average composition of ` `    * a collection of ``RichSequence``s.` `    * @param iter an iterator over ``RichSequences``s.` `    * @throws org.biojava.bio.symbol.IllegalAlphabetException if a ` `    * ``Distribution`` cannot be made for this ``Alphabet` `    * @throws org.biojava.bio.symbol.IllegalSymbolException if a ``Symbol`` from another ` `    * ``Alphabet`` is presented to the` `    * ``DistributionTrainer``.` `    * @throws org.biojava.bio.BioException unlikely to occur unless calculating windowed` `    * composition on a sequence not evenly divisible` `    * by the window length. Can also occur if a` `    * ``SymbolList`` or ``RichSequence` `    * is unavailable from an iterator or if` `    * a ``Distribution`` somehow becomes` `    * locked during training.` `    * @return a ``Distribution`` representing the` `    * calculated composition.` `    */` `   public Distribution averageComposition(RichSequenceIterator iter) ` `       throws IllegalAlphabetException, IllegalSymbolException, BioException{` `       return averageCompostion(this.asIterator(iter), 1, false);` `   }` `   ` `   /**` `    * Determine the average composition of higer order words from` `    * a collection of ``RichSequence``s. Optionally windowed` `    * (non-overlapping) or overlapping words can be used. Codons` `    * would be an example of 3rd order windowed words.` `    * @param iter an iterator over ``RichSequences``s.` `    * @param order the order of words to count (eg for triplets use 3)` `    * @param windowed true to count non-overlapping words (eg codons).` `    * @throws org.biojava.bio.symbol.IllegalAlphabetException if a ``Distribution` `    * cannot be made for this ``Alphabet` `    * @throws org.biojava.bio.symbol.IllegalSymbolException if a ``Symbol`` from another ` `    * ``Alphabet`` is presented to the` `    * ``DistributionTrainer``.` `    * @throws org.biojava.bio.BioException unlikely to occur unless calculating windowed` `    * composition on a sequence not evenly divisible` `    * by the window length. Can also occur if a` `    * ``SymbolList`` or ``RichSequence` `    * is unavailable from an iterator or if` `    * a ``Distribution`` somehow becomes` `    * locked during training.` `    * @return a ``Distribution`` representing the` `    * calculated composition.` `    */` `   public Distribution averageComposition(RichSequenceIterator iter, int order, boolean windowed) ` `       throws IllegalAlphabetException, IllegalSymbolException, BioException{` `       return averageCompostion(this.asIterator(iter), order, windowed);` `   }` `   ` `   /**` `    * Display help on the use of the program.` `    */` `   public static void help(){` `       HelpFormatter helpf = new HelpFormatter();` `       helpf.printHelp("java Composition [options]", options());` `       System.exit(0);` `   }` `   ` `   protected static Options options(){` `       Options options = new Options();` `       ` `       Option file = new Option("i", "infile", true, "A sequence file");` `              file.setRequired(true);` `       Option format = new Option("f", "format", true, "infile format. "+` `               "Can be a common name, eg fasta, or a fully qualified "+` `               "class name, eg org.biojavax.bio.seq.io.FastaFormat");` `              format.setRequired(true);` `       Option alpha = new Option(` `                        "a", "alphabet name", true, "the name of the Alphabet eg DNA, RNA, Protein");` `              alpha.setRequired(true);` `       Option order = new Option(` `                        "o", "order", true, "and int value, the order of the nmers analysed, default is 1");` `              order.setRequired(false);` `       Option windowed = new Option(` `                           "w", "windowed", false,` `                           "optional flag to use windowed nmers instead of sliding nmers");` `              windowed.setRequired(false);` `       Option verbose = new Option(` `                         "v", "verbose", false,` `                         "print summary to screen, if x is not set then this is true by default");` `              verbose.setRequired(false);` `       Option output = new Option("x", "output", true, "output xml to the named file");` `              output.setRequired(false);` `       ` `       options.addOption(file);` `       options.addOption(format);` `       options.addOption(alpha);` `       options.addOption(order);` `       options.addOption(windowed);` `       options.addOption(verbose);` `       options.addOption(output);` `       ` `       return options;` `   }` `   ` `   /**` `    * Takes each ``SymbolList`` from the ``Iterator`` and applies` `    * a view to it. The view can be windowed (eg codons) or` `    * sliding (eg overlapping dimers)` `    * @param iter The input iterator` `    * @param nmerSize The size of the window eg 3 for codons. ` `    * If the size is less than 2 then you get back ` `    * the original ``Iterator` `    * @param windowed true if you want non-overlapping nmers (eg codons),` `    * false if you want them to overlap.` `    * @return An ``Iterator`` over ``SymbolLists`` with the ` `    * desired view applied. ``You cannot call ``remove()`` on this iterator!` `    */` `   public Iterator`` nmerView(` `           Iterator`<? extends SymbolList>` iter,` `           int nmerSize,` `           boolean windowed){` `       ` `       if(nmerSize < 2) return (Iterator``)iter;` `       ` `       final Iterator`<? extends SymbolList>` it = iter;` `       final int size = nmerSize;` `       final boolean w = windowed;` `       return new Iterator``(){` `           public boolean hasNext(){` `               return it.hasNext();` `           }` `           public SymbolList next() {` `               try{` `                 SymbolList source = it.next();` `                 if(w){` `                     return SymbolListViews.windowedSymbolList(source, size);` `                 }else{` `                     return SymbolListViews.orderNSymbolList(source, size);` `                 }` `               }catch(BioException e){` `                   NoSuchElementException ex = new NoSuchElementException();` `                   ex.initCause(e);` `                   throw ex;` `               }` `           }` `           public void remove(){` `               throw new UnsupportedOperationException();` `           }` `       };` `   }` `   ` `   /**` `    * Makes a ``SequenceIterator`` look like an ` `    * ``Iterator {@code }` `    * @param iter The ``SequenceIterator` `    * @return An ``Iterator`` that returns only ``Sequence` `    * objects. ``You cannot call ``remove()`` on this iterator!` `    */` `   public Iterator`` asIterator(SequenceIterator iter){` `       final SequenceIterator it = iter;` `       return new Iterator``(){` `           public boolean hasNext(){` `               return it.hasNext();` `           }` `           public Sequence next() {` `               try{` `                 return it.nextSequence();` `               }catch(BioException e){` `                   NoSuchElementException ex = new NoSuchElementException();` `                   ex.initCause(e);` `                   throw ex;` `               }` `           }` `           public void remove(){` `               throw new UnsupportedOperationException();` `           }` `       };` `   }` `   ` `   public static void writeDistributionAsText(Distribution d, ` `           PrintStream out, char seperator, int decimalPlaces) throws IOException{` `       ` `       NumberFormat format = NumberFormat.getInstance();` `       format.setMaximumFractionDigits(decimalPlaces);` `       FiniteAlphabet alpha = (FiniteAlphabet)d.getAlphabet();` `       List`` toke = new ArrayList``();` `               ` `       //for each component alphabet get the tokenization` `       for(Iterator it = alpha.getAlphabets().iterator(); it.hasNext();){` `           Alphabet component = (Alphabet)it.next();` `           try{` `             toke.add(component.getTokenization("token"));` `           }catch(Exception ex){` `               //no tokenization` `               toke.add(null);` `           }` `       }` `               ` `       for(Iterator it = alpha.iterator(); it.hasNext();){` `           Symbol s = (Symbol)it.next();` `           StringBuilder sname = new StringBuilder();` `           ` `           List symbols = ((AtomicSymbol)s).getSymbols();` `           for(int i = 0; i < symbols.size(); i++){` `               if(i > 0) sname.append(' ');` `               Symbol sym = (Symbol)symbols.get(i);` `               if(toke.get(i) != null){` `                   try{` `                       sname.append(toke.get(i).tokenizeSymbol(sym));` `                   }catch(IllegalSymbolException ex){` `                       throw new BioError(ex); //should never happen.` `                   }` `               }else{` `                   sname.append(sym.getName());` `               }` `           }   ` `           ` `           try{` `             out.print(sname.toString()+seperator+` `                   format.format(d.getWeight(s))+"\n");` `           }catch(IllegalSymbolException e){` `               throw new BioError(e); //this should never happen in this case` `           }` `       }` `       out.flush();` `       out.close();` `   }` `   ` `   /**` `    * Attempts to find a format for a name String such as "genbank" or for a` `    * fully qualified string like org.biojavax.bio.seq.io.UniProtFormat` `    * @return the matching ``RichSequenceFormat` `    * @param name the name of the format, case insensitive except for qualified class names` `    * @throws java.lang.IllegalAccessException If java cannot reflectively access the named format.` `    * Only applies to fully qualified class names.` `    * @throws java.lang.ClassNotFoundException If a format can not be found for the name.` `    * @throws java.lang.InstantiationException If the found object cannot be created (only applies` `    * to fully qualified class names).` `    */` `   public static RichSequenceFormat formatForName(String name) ` `           throws ClassNotFoundException, InstantiationException, IllegalAccessException{` `       //determine the format to use` `       RichSequenceFormat format;` `       if(name.equalsIgnoreCase("fasta")){` `           format = new FastaFormat();` `       }` `       else if(name.equalsIgnoreCase("genbank")){` `           format = new GenbankFormat();` `       }` `       else if(name.equalsIgnoreCase("uniprot")){` `           format = new UniProtFormat();` `       }` `       else if(name.equalsIgnoreCase("embl")){` `           format = new EMBLFormat();` `       }` `       else if(name.equalsIgnoreCase("INSDseq")){` `           format = new INSDseqFormat();` `       }` `       else{` `           Class formatClass = Class.forName(name);` `           format = (RichSequenceFormat)formatClass.newInstance();` `       }` `       return format;` `   }` `   ` `   /**` `    * Use this class as an application` `    * @param args the command line arguments` `    * @throws java.lang.Exception if something goes wrong` `    */` `   public static void main(String[] args) throws Exception{` `       ` `       CommandLineParser cliparser = new PosixParser();` `       CommandLine cmd = null;` `       try{` `           cmd = cliparser.parse(options(), args, true);` `       }catch(Exception e){` `           help();` `       }` `       ` `       BufferedReader br = new BufferedReader(` `               new FileReader(cmd.getOptionValue('i')));` `       ` `       RichSequenceFormat format = ` `               formatForName(cmd.getOptionValue('f'));` `       SymbolTokenization toke = null;` `       ` `       try{` `           toke = AlphabetManager.alphabetForName(` `               cmd.getOptionValue('a')).getTokenization("token");` `       }catch(NoSuchElementException ex){` `           //try it upper case` `           toke = AlphabetManager.alphabetForName(` `               cmd.getOptionValue('a').toUpperCase()).getTokenization("token");` `       }` `       int order = Integer.parseInt(cmd.getOptionValue('o', "1"));` `       boolean windowed = cmd.hasOption('w');` `       ` `       ` `       format.setElideComments(true); //don't need these` `       format.setElideFeatures(true);   //don't need these` `       format.setElideReferences(true); //don't need these` `       RichStreamReader sr = new  RichStreamReader(` `               br, format, toke, ` `               RichSequenceBuilderFactory.THRESHOLD, ` `               RichObjectFactory.getDefaultNamespace());` `       ` `       Composition compo = new Composition();` `       Distribution average = compo.averageComposition(sr, order, windowed);` `       ` `       if(cmd.hasOption('v') || cmd.hasOption('x') == false){` `          writeDistributionAsText(average, System.out, ',', 8);` `       }` `       ` `       if(cmd.hasOption('x')){` `           String filename = cmd.getOptionValue('x');` `           try{` `               DistributionTools.writeToXML(` `                       average, new FileOutputStream(filename));` `           }catch(Exception e){` `               System.err.println("Couldn't write "+filename);` `               e.printStackTrace(System.err);` `           }` `       }` `   }`

}

```