001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022
023package org.biojava.bio.symbol;
024
025import java.util.List;
026import java.util.NoSuchElementException;
027import java.util.Set;
028
029import org.biojava.bio.Annotatable;
030import org.biojava.bio.BioException;
031import org.biojava.bio.seq.io.SymbolTokenization;
032import org.biojava.utils.ChangeType;
033
034/**
035 * <p>
036 * The set of AtomicSymbols which can be concatenated together to make a
037 * SymbolList.
038 * </p>
039 *
040 * <p>
041 * A non-atomic symbol is considered to be contained within this alphabet if
042 * all of the atomic symbols that it could match are members of this alphabet.
043 * </p>
044 *
045 * @author Matthew Pocock
046 * @author Thomas Down
047 */
048 
049public interface Alphabet extends Annotatable {
050  /** 
051   * <p>
052   * This ChangeType indicates that some symbols have been added or removed from
053   * the alphabet. The current and previous fields should indicate what symbols
054   * were there originally, and what they have been replaced with.
055   * <p>
056   *
057   * <p>
058   * If the alphabet wishes to propagate that the symbol has changed state, then
059   * previous and current should be null, but the chainedEvent property should
060   * rever to the ChangeEvent on the unerlying Symbol.
061   * </p>
062   */
063  public static ChangeType SYMBOLS = new ChangeType(
064    "The set of symbols in this alphabet has changed.",
065    "org.biojava.bio.symbol.Alphabet",
066    "SYMBOLS"
067  );
068  
069  /**
070   * This signals that the available parsers have changed. If a parser is added,
071   * it will appear in getChanged(). If it is removed, it will appear in
072   * getPrevious().
073   */
074  public static ChangeType PARSERS = new ChangeType(
075    "The set of parsers has changed.",
076    "org.biojava.bio.symbol.Alphabet",
077    "PARSERS"
078  );
079  
080  /**
081   * Get the name of the alphabet.
082   *
083   * @return  the name as a string.
084   */
085  String getName();
086
087  /**
088   * Return an ordered List of the alphabets which make up a
089   * compound alphabet.  For simple alphabets, this will return
090   * a singleton list of itself. The returned list should be immutable.
091   *
092   * @return a List of alphabets
093   */
094  List<Alphabet> getAlphabets();
095
096  /**
097   * <p>
098   * Get a symbol from the Alphabet which corresponds
099   * to the specified ordered list of symbols.
100   * </p>
101   *
102   * <p>
103   * The symbol at i in the list must be a member of the i'th alphabet in
104   * getAlphabets. If all of the symbols in rl are atomic, then the resulting
105   * symbol will also be atomic. If any one of them is an ambiguity symbol then
106   * the resulting symbol will be the appropriate ambiguity symbol.
107   * </p>
108   *
109   * @param rl A list of Symbol instances
110   * @throws IllegalSymbolException if the members of rl are
111   *            not Symbols over the alphabets returned from
112   *            <code>getAlphabets</code>
113   */
114  Symbol getSymbol(List<Symbol> rl) 
115    throws IllegalSymbolException;
116
117  /**
118   * <p>
119   * Get a symbol that represents the set of symbols in syms.
120   * </p>
121   *
122   * <p>
123   * Syms must be a set of Symbol instances each of which is contained within
124   * this alphabet. This method is used to retrieve ambiguity symbols.
125   * </p>
126   *
127   * @param syms  the Set of Symbols that will be found in getMatches of the
128   *            returned symbol
129   * @return a Symbol (possibly fly-weighted) for the Set of symbols in syms
130   */
131  Symbol getAmbiguity(Set<Symbol> syms)
132  throws IllegalSymbolException;
133  
134  /**
135   * <p>
136   * Get the 'gap' ambiguity symbol that is most appropriate for this alphabet.
137   * </p>
138   *
139   * <p>
140   * In general, this will be a BasisSymbol that represents a list of
141   * AlphabetManager.getGapSymbol() the same length as the getAlphabets list.
142   * </p>
143   *
144   * @return the appropriate gap Symbol instance
145   */
146  Symbol getGapSymbol();
147  
148  /**
149   * <p>
150   * Returns whether or not this Alphabet contains the symbol.
151   * </p>
152   *
153   * <p>
154   * An alphabet contains an ambiguity symbol iff the ambiguity symbol's
155   * getMatches() returns an alphabet that is a proper sub-set of this
156   * alphabet. That means that every one of the symbols that could match the
157   * ambiguity symbol is also a member of this alphabet.
158   * </p>
159   *
160   * @param s the Symbol to check
161   * @return  boolean true if the Alphabet contains the symbol and false otherwise
162   */
163  boolean contains(Symbol s);
164
165  /**
166   * <p>
167   * Throws a precanned IllegalSymbolException if the symbol is not contained
168   * within this Alphabet.
169   * </p>
170   *
171   * <p>
172   * This function is used all over the code to validate symbols as they enter
173   * a method. Also, the code is littered with catches for
174   * IllegalSymbolException. There is a preferred style of handling this,
175   * which should be covererd in the package documentation.
176   * </p>
177   *
178   * @param s the Symbol to validate
179   * @throws  IllegalSymbolException if r is not contained in this alphabet
180   */
181  void validate(Symbol s) throws IllegalSymbolException;
182  
183  /**
184   * <p>
185   * Get a SymbolTokenization by name.
186   * </p>
187   *
188   * <p>
189   * The parser returned is guaranteed to return Symbols and SymbolLists that
190   * conform to this alphabet.
191   * </p>
192   *
193   * <p>
194   * Every alphabet should have a SymbolTokenzation under the name 'token' that
195   * uses the symbol token characters to translate a string into a
196   * SymbolList. Likewise, there should be a SymbolTokenization under the name
197   * 'name' that uses symbol names to identify symbols. Any other names may
198   * also be defined, but the behavior of the returned SymbolTokenization is
199   * not defined here.
200   * </p>
201   * <p>
202   * A SymbolTokenization under the name 'default' should be defined for all
203   * sequences, that determines the behavior when printing out a
204   * sequence. Standard behavior is to define the 'token' SymbolTokenization
205   * as default if it exists, else to define the 'name' SymbolTokenization as
206   * the default, but others are possible.
207   * </p>
208   *
209   * @param name  the name of the parser
210   * @return  a parser for that name
211   * @throws NoSuchElementException if the name is unknown
212   * @throws BioException if for any reason the tokenization could not be built
213   * @since 1.2
214   */
215    
216    public SymbolTokenization getTokenization(String name) throws BioException;
217  
218  /**
219   * A really useful static alphabet that is always empty.
220   */
221  static final FiniteAlphabet EMPTY_ALPHABET = new EmptyAlphabet();
222}