001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
024import java.util.HashMap;
025import java.util.Iterator;
026import java.util.Map;
027
028import org.biojava.bio.BioError;
029import org.biojava.bio.BioException;
030import org.biojava.bio.symbol.AlphabetManager;
031import org.biojava.bio.symbol.FiniteAlphabet;
032import org.biojava.bio.symbol.IllegalSymbolException;
033import org.biojava.bio.symbol.Symbol;
034import org.biojava.bio.symbol.SymbolList;
035
036/**
037 * <code>SymbolListCharSequence</code> is a <code>CharSequence</code>
038 * implementation which wraps a <code>SymbolList</code>. It is present
039 * primarily to support regular expression matching over
040 * <code>SymbolList</code>s as it avoids creating a copy.
041 *
042 * @author Keith James
043 * @author Matthew Pocock
044 * @since 1.3
045 */
046public class SymbolListCharSequence implements CharSequence
047{
048    private SymbolList syms;
049    private Map alphaTokens;
050
051    /**
052     * Creates a new <code>SymbolListCharSequence</code> wrapping a
053     * <code>SymbolList</code>.
054     *
055     * @param syms a <code>SymbolList</code>.
056     */
057    public SymbolListCharSequence(SymbolList syms)
058    {
059        FiniteAlphabet alphabet = (FiniteAlphabet) syms.getAlphabet();
060        if (! (alphabet instanceof FiniteAlphabet))
061            throw new IllegalArgumentException("Only SymbolLists using a FiniteAlphabet are supported by SymbolListCharSequence");
062
063        SymbolTokenization sToke = getTokenizer(alphabet, "token");
064        if (sToke == null)
065            sToke = getTokenizer(alphabet, "unicode");
066        if (sToke == null)
067            throw new BioError("unable to get a character tokenization for alphabet " + alphabet.getName());
068
069        this.syms = syms;
070        alphaTokens = new HashMap(Math.round(alphabet.size() / 0.75f) + 1);
071
072
073        try
074        {
075            for (Iterator si = AlphabetManager.getAllSymbols(alphabet).iterator(); si.hasNext();)
076            {
077                Symbol s = (Symbol) si.next();
078                char symChar = sToke.tokenizeSymbol(s).charAt(0);
079                alphaTokens.put(s, new Character(symChar));
080            }
081        }
082        catch (IllegalSymbolException ise)
083        {
084            throw new BioError("Internal error: failed to tokenize a Symbol from an existing SymbolList", ise);
085        }
086    }
087
088    private SymbolTokenization getTokenizer(FiniteAlphabet alphabet, String tokenType)
089    {
090        SymbolTokenization sToke = null;
091        try
092        {
093            sToke = alphabet.getTokenization(tokenType);
094        }
095        catch (BioException be)
096        {
097            return null;
098        }
099
100        if (sToke.getTokenType() != SymbolTokenization.CHARACTER)
101            return null;
102        else
103            return sToke;
104    }
105
106    private SymbolListCharSequence(SymbolList syms, Map alphaTokens)
107    {
108        this.syms        = syms;
109        this.alphaTokens = alphaTokens;
110    }
111
112    public char charAt(int index)
113    {
114        return ((Character) alphaTokens.get(syms.symbolAt(index + 1))).charValue();
115    }
116
117    public int length()
118    {
119        return syms.length();
120    }
121
122    public CharSequence subSequence(int start, int end)
123    {
124        return new SymbolListCharSequence(syms.subList(start + 1, end),
125                                          alphaTokens);
126    }
127
128    public String toString()
129    {
130        return syms.seqString();
131    }
132}