001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.program.abi;
022
023import java.util.ArrayList;
024import java.util.Iterator;
025import java.util.List;
026
027import org.biojava.bio.BioError;
028import org.biojava.bio.alignment.Alignment;
029import org.biojava.bio.seq.DNATools;
030import org.biojava.bio.seq.io.CharacterTokenization;
031import org.biojava.bio.seq.io.SymbolTokenization;
032import org.biojava.bio.symbol.AbstractAlphabet;
033import org.biojava.bio.symbol.AlphabetManager;
034import org.biojava.bio.symbol.AtomicSymbol;
035import org.biojava.bio.symbol.FiniteAlphabet;
036import org.biojava.bio.symbol.IllegalAlphabetException;
037import org.biojava.bio.symbol.IntegerAlphabet;
038import org.biojava.bio.symbol.Symbol;
039import org.biojava.bio.symbol.SymbolList;
040import org.biojava.bio.symbol.SymbolListViews;
041import org.biojava.utils.ListTools;
042
043/**
044 * Useful functionality for working with fasta files where the quality of the
045 * DNA is encoded as upper and lower case DNA characters.
046 *
047 * @author Matthew Pocock
048 */
049public class ABITools {
050  /**
051   * The quality alphabet. This is equivalent to DNA x [0,1] where 0 represents
052   * poorly supported (lower case) and 1 represents strongly supported (upper
053   * case).
054   */
055  public static final FiniteAlphabet QUALITY;
056
057  /**
058   * The poorly supported symbol.
059   */
060  public static final AtomicSymbol _0;
061
062  /**
063   * The well supported symbol.
064   */
065  public static final AtomicSymbol _1;
066
067  /**
068   * Alignment label for the DNA sequence row.
069   */
070  public static final Object SEQUENCE = "SEQUENCE";
071
072  /**
073   * Alignment label for the support row.
074   */
075  public static final Object SUPPORT = "SUPPORT";
076
077  static {
078    try {
079      IntegerAlphabet.SubIntegerAlphabet _01
080      = IntegerAlphabet.getSubAlphabet(0, 1);
081      _0 = _01.getSymbol(0);
082      _1 = _01.getSymbol(1);
083
084      List alphas = new ArrayList();
085      alphas.add(DNATools.getDNA());
086      alphas.add(_01);
087
088      // naughty here - we know because we are insiders that the result of this
089      // call will be an AbstractAlphabet impl
090      AbstractAlphabet quality = (AbstractAlphabet) AlphabetManager.getCrossProductAlphabet(alphas);
091      CharacterTokenization tok = new CharacterTokenization(quality, true);
092
093      // all lower case characters go to sym,0
094      // all upper case characters go to sym,1
095      SymbolList sl = DNATools.createDNA("agctrymkswhbvdn");
096      ListTools.Doublet pair = new ListTools.Doublet();
097      SymbolTokenization dnaTok = DNATools.getDNA().getTokenization("token");
098      for(Iterator i = sl.iterator(); i.hasNext(); ) {
099        pair.setA((Symbol) i.next());
100        String c = dnaTok.tokenizeSymbol((Symbol) pair.getA());
101
102        pair.setB(_1);
103        tok.bindSymbol(quality.getSymbol(pair), c.toUpperCase().charAt(0));
104
105        pair.setB(_0);
106        tok.bindSymbol(quality.getSymbol(pair), c.toLowerCase().charAt(0));
107      }
108
109      quality.putTokenization("token", tok);
110      QUALITY = quality;
111    } catch (Exception e) {
112      throw new BioError("Could not initialize ABI quality alphabet",e);
113    }
114  }
115
116  /**
117   * <p>
118   * View a symbol list over the QUALITY alphabet as an alignment.
119   * </p>
120   *
121   * <p>
122   * The alignment will have labels of SEQUENCE and SUPPORT that retrieve the
123   * DNA sequence and the binary support values respectively.
124   * </p>
125   *
126   * @param abiSeq  the SymbolList over the QUALITY alphabet to view
127   * @return an Alignment view of abiSeq
128   * @throws IllegalAlphabetException if abiSeq is not over QUALITY
129   */
130  public static Alignment getAlignment(SymbolList abiSeq)
131  throws IllegalAlphabetException {
132    return SymbolListViews.alignment(
133      new ListTools.Doublet(SEQUENCE, SUPPORT),
134      abiSeq
135    );
136  }
137}