Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022
023package org.biojava.bio.seq.io;
024
025import java.io.Serializable;
026import java.util.HashMap;
027import java.util.Iterator;
028import java.util.Map;
029
030import org.biojava.bio.Annotation;
031import org.biojava.bio.BioError;
032import org.biojava.bio.symbol.Alphabet;
033import org.biojava.bio.symbol.AlphabetManager;
034import org.biojava.bio.symbol.FiniteAlphabet;
035import org.biojava.bio.symbol.IllegalAlphabetException;
036import org.biojava.bio.symbol.IllegalSymbolException;
037import org.biojava.bio.symbol.Symbol;
038import org.biojava.bio.symbol.SymbolList;
039import org.biojava.utils.Unchangeable;
040
041/**
042 * <p>Implementation of SymbolTokenization which binds symbols to
043 * strings of characters. These tokenizations are intented to provide
044 *  alternate way of writing sequences into Strings.  Therefore they cannot be
045 * used for parsing files.</p>
046 *
047 * <p>As this release, alternate tokenizations are available for the built-in
048 * DNA alphabet (write symbols as capital letter) and PROTEIN-TERM alphabet
049 * (write symbol as triplets of characters with the first one being a capital
050 * letter as in "Glu".</p>
051 *
052 * <p>By convention, instances of AlternateTokenization should have an associated
053 * token starting by the word 'alternate'.
054 *
055 * @author George Waldon
056 * @since 1.5
057 */
058
059public class AlternateTokenization extends Unchangeable
060        implements SymbolTokenization, Serializable {
061    
062    private Alphabet alphabet;
063    private Map symbolsToStrings = new HashMap();
064    private boolean caseSensitive;
065    private boolean initiated = false;
066    private int width = 0;
067    
068    public AlternateTokenization(Alphabet alpha, boolean caseSensitive) {
069        alphabet = alpha;
070        this.caseSensitive = caseSensitive;
071    }
072    
073    public Alphabet getAlphabet() {
074        return alphabet;
075    }
076    
077    /** Tokens have fixed size.
078     */
079    public TokenType getTokenType() {
080        return FIXEDWIDTH;
081    }
082    
083    public Annotation getAnnotation() {
084        return Annotation.EMPTY_ANNOTATION;
085    }
086    
087    private synchronized void init(String str) {
088        if(initiated) return;
089        width = str.length();
090        initiated = true;
091    }
092    
093    /** Get the width of the tokens.
094     */
095    public int getWidth() {
096        if(initiated==false)
097            throw new IllegalStateException("Tokenization not initialize yet");
098        return width;
099    }
100    
101    /** Bind a Symbol to a string.
102     *
103     * @param s  the Symbol to bind
104     * @param str  the string to bind it to
105     */
106    public void bindSymbol(Symbol s, String str) {
107        if(!initiated)
108            init(str);
109        if(str.length()!=width)
110            throw new IllegalArgumentException("This tokenization must have all its tokens with the same size");
111        if (!symbolsToStrings.containsKey(s)) {
112            symbolsToStrings.put(s, str);
113        }
114    }
115    
116    /** Will throw an exception.
117     */
118    public Symbol parseToken(String token)
119    throws IllegalSymbolException {
120        throw new UnsupportedOperationException("AlternateTokenization are for writing only");
121    }
122    
123    public String tokenizeSymbol(Symbol s) throws IllegalSymbolException {
124        String str = (String) symbolsToStrings.get(s);
125        if (str == null) {
126            Alphabet alpha = getAlphabet();
127            alphabet.validate(s);
128            if (alpha instanceof FiniteAlphabet) {
129                str = (String) symbolsToStrings.get(AlphabetManager.getAllAmbiguitySymbol((FiniteAlphabet) alpha));
130            }
131            if (str == null) {
132                throw new IllegalSymbolException("No mapping for symbol " + s.getName());
133            }
134        }
135        return str;
136    }
137    
138    public String tokenizeSymbolList(SymbolList sl)
139    throws IllegalAlphabetException {
140        if (sl.getAlphabet() != getAlphabet()) {
141            throw new IllegalAlphabetException("Alphabet " + sl.getAlphabet().getName() + " does not match " + getAlphabet().getName());
142        }
143        StringBuffer sb = new StringBuffer();
144        for (Iterator i = sl.iterator(); i.hasNext(); ) {
145            Symbol sym = (Symbol) i.next();
146            try {
147                String str = tokenizeSymbol(sym);
148                sb.append(str);
149            } catch (IllegalSymbolException ex) {
150                throw new IllegalAlphabetException(ex, "Couldn't tokenize");
151            }
152        }
153        
154        return sb.substring(0);
155    }
156    
157    /** Will throw an exception.
158     */
159    public StreamParser parseStream(SeqIOListener listener) {
160        throw new UnsupportedOperationException("AlternateTokenization are for writing only");
161    }
162}