001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022 023package org.biojava.bio.symbol; 024 025import java.util.List; 026import java.util.NoSuchElementException; 027import java.util.Set; 028 029import org.biojava.bio.Annotatable; 030import org.biojava.bio.BioException; 031import org.biojava.bio.seq.io.SymbolTokenization; 032import org.biojava.utils.ChangeType; 033 034/** 035 * <p> 036 * The set of AtomicSymbols which can be concatenated together to make a 037 * SymbolList. 038 * </p> 039 * 040 * <p> 041 * A non-atomic symbol is considered to be contained within this alphabet if 042 * all of the atomic symbols that it could match are members of this alphabet. 043 * </p> 044 * 045 * @author Matthew Pocock 046 * @author Thomas Down 047 */ 048 049public interface Alphabet extends Annotatable { 050 /** 051 * <p> 052 * This ChangeType indicates that some symbols have been added or removed from 053 * the alphabet. The current and previous fields should indicate what symbols 054 * were there originally, and what they have been replaced with. 055 * <p> 056 * 057 * <p> 058 * If the alphabet wishes to propagate that the symbol has changed state, then 059 * previous and current should be null, but the chainedEvent property should 060 * rever to the ChangeEvent on the unerlying Symbol. 061 * </p> 062 */ 063 public static ChangeType SYMBOLS = new ChangeType( 064 "The set of symbols in this alphabet has changed.", 065 "org.biojava.bio.symbol.Alphabet", 066 "SYMBOLS" 067 ); 068 069 /** 070 * This signals that the available parsers have changed. If a parser is added, 071 * it will appear in getChanged(). If it is removed, it will appear in 072 * getPrevious(). 073 */ 074 public static ChangeType PARSERS = new ChangeType( 075 "The set of parsers has changed.", 076 "org.biojava.bio.symbol.Alphabet", 077 "PARSERS" 078 ); 079 080 /** 081 * Get the name of the alphabet. 082 * 083 * @return the name as a string. 084 */ 085 String getName(); 086 087 /** 088 * Return an ordered List of the alphabets which make up a 089 * compound alphabet. For simple alphabets, this will return 090 * a singleton list of itself. The returned list should be immutable. 091 * 092 * @return a List of alphabets 093 */ 094 List<Alphabet> getAlphabets(); 095 096 /** 097 * <p> 098 * Get a symbol from the Alphabet which corresponds 099 * to the specified ordered list of symbols. 100 * </p> 101 * 102 * <p> 103 * The symbol at i in the list must be a member of the i'th alphabet in 104 * getAlphabets. If all of the symbols in rl are atomic, then the resulting 105 * symbol will also be atomic. If any one of them is an ambiguity symbol then 106 * the resulting symbol will be the appropriate ambiguity symbol. 107 * </p> 108 * 109 * @param rl A list of Symbol instances 110 * @throws IllegalSymbolException if the members of rl are 111 * not Symbols over the alphabets returned from 112 * <code>getAlphabets</code> 113 */ 114 Symbol getSymbol(List<Symbol> rl) 115 throws IllegalSymbolException; 116 117 /** 118 * <p> 119 * Get a symbol that represents the set of symbols in syms. 120 * </p> 121 * 122 * <p> 123 * Syms must be a set of Symbol instances each of which is contained within 124 * this alphabet. This method is used to retrieve ambiguity symbols. 125 * </p> 126 * 127 * @param syms the Set of Symbols that will be found in getMatches of the 128 * returned symbol 129 * @return a Symbol (possibly fly-weighted) for the Set of symbols in syms 130 */ 131 Symbol getAmbiguity(Set<Symbol> syms) 132 throws IllegalSymbolException; 133 134 /** 135 * <p> 136 * Get the 'gap' ambiguity symbol that is most appropriate for this alphabet. 137 * </p> 138 * 139 * <p> 140 * In general, this will be a BasisSymbol that represents a list of 141 * AlphabetManager.getGapSymbol() the same length as the getAlphabets list. 142 * </p> 143 * 144 * @return the appropriate gap Symbol instance 145 */ 146 Symbol getGapSymbol(); 147 148 /** 149 * <p> 150 * Returns whether or not this Alphabet contains the symbol. 151 * </p> 152 * 153 * <p> 154 * An alphabet contains an ambiguity symbol iff the ambiguity symbol's 155 * getMatches() returns an alphabet that is a proper sub-set of this 156 * alphabet. That means that every one of the symbols that could match the 157 * ambiguity symbol is also a member of this alphabet. 158 * </p> 159 * 160 * @param s the Symbol to check 161 * @return boolean true if the Alphabet contains the symbol and false otherwise 162 */ 163 boolean contains(Symbol s); 164 165 /** 166 * <p> 167 * Throws a precanned IllegalSymbolException if the symbol is not contained 168 * within this Alphabet. 169 * </p> 170 * 171 * <p> 172 * This function is used all over the code to validate symbols as they enter 173 * a method. Also, the code is littered with catches for 174 * IllegalSymbolException. There is a preferred style of handling this, 175 * which should be covererd in the package documentation. 176 * </p> 177 * 178 * @param s the Symbol to validate 179 * @throws IllegalSymbolException if r is not contained in this alphabet 180 */ 181 void validate(Symbol s) throws IllegalSymbolException; 182 183 /** 184 * <p> 185 * Get a SymbolTokenization by name. 186 * </p> 187 * 188 * <p> 189 * The parser returned is guaranteed to return Symbols and SymbolLists that 190 * conform to this alphabet. 191 * </p> 192 * 193 * <p> 194 * Every alphabet should have a SymbolTokenzation under the name 'token' that 195 * uses the symbol token characters to translate a string into a 196 * SymbolList. Likewise, there should be a SymbolTokenization under the name 197 * 'name' that uses symbol names to identify symbols. Any other names may 198 * also be defined, but the behavior of the returned SymbolTokenization is 199 * not defined here. 200 * </p> 201 * <p> 202 * A SymbolTokenization under the name 'default' should be defined for all 203 * sequences, that determines the behavior when printing out a 204 * sequence. Standard behavior is to define the 'token' SymbolTokenization 205 * as default if it exists, else to define the 'name' SymbolTokenization as 206 * the default, but others are possible. 207 * </p> 208 * 209 * @param name the name of the parser 210 * @return a parser for that name 211 * @throws NoSuchElementException if the name is unknown 212 * @throws BioException if for any reason the tokenization could not be built 213 * @since 1.2 214 */ 215 216 public SymbolTokenization getTokenization(String name) throws BioException; 217 218 /** 219 * A really useful static alphabet that is always empty. 220 */ 221 static final FiniteAlphabet EMPTY_ALPHABET = new EmptyAlphabet(); 222}