/*
 * BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence. This should
 * be distributed with the code. If you do not have a copy,
 * see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 */


package org.biojava.bio.seq.io;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.biojava.bio.Annotation;
import org.biojava.bio.BioError;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.IllegalAlphabetException;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.utils.Unchangeable;

/**
 * <p>Implementation of SymbolTokenization which binds symbols to
 * strings of characters. These tokenizations are intended to provide
 * an alternate way of writing sequences into Strings. They are write-only
 * and therefore cannot be used for parsing files.</p>
 *
 * <p>As of this release, alternate tokenizations are available for the built-in
 * DNA alphabet (write symbols as capital letters) and PROTEIN-TERM alphabet
 * (write symbols as triplets of characters with the first one being a capital
 * letter, as in "Glu").</p>
 *
 * <p>By convention, instances of AlternateTokenization should have an associated
 * token starting with the word 'alternate'.</p>
 *
 * <p>All tokens of one instance have the same length; that length is fixed by
 * the first string passed to {@link #bindSymbol(Symbol, String)}.</p>
 *
 * @author George Waldon
 * @since 1.5
 */

public class AlternateTokenization extends Unchangeable
        implements SymbolTokenization, Serializable {

    // NOTE: field declarations and non-private signatures are left exactly as
    // they were. The class is Serializable without an explicit
    // serialVersionUID, so changing them would alter the default UID.
    private Alphabet alphabet;
    private Map symbolsToStrings = new HashMap();
    private boolean caseSensitive;
    private boolean initiated = false;
    private int width = 0;

    /**
     * Create a tokenization with no symbol bound yet.
     *
     * @param alpha the alphabet whose symbols will be written
     * @param caseSensitive case-sensitivity flag; it is stored but not
     *        consulted by any method of this class
     */
    public AlternateTokenization(Alphabet alpha, boolean caseSensitive) {
        alphabet = alpha;
        this.caseSensitive = caseSensitive;
    }

    /**
     * Get the alphabet given at construction time.
     *
     * @return the alphabet of this tokenization
     */
    public Alphabet getAlphabet() {
        return alphabet;
    }

    /**
     * Tokens have fixed size.
     *
     * @return always <code>FIXEDWIDTH</code>
     */
    public TokenType getTokenType() {
        return FIXEDWIDTH;
    }

    /**
     * This tokenization carries no annotation.
     *
     * @return always <code>Annotation.EMPTY_ANNOTATION</code>
     */
    public Annotation getAnnotation() {
        return Annotation.EMPTY_ANNOTATION;
    }

    /**
     * Record the token width from the first bound string. Calls made after
     * the width has been recorded are no-ops.
     *
     * @param str the first string bound to a symbol
     */
    private synchronized void init(String str) {
        if (initiated) {
            return;
        }
        width = str.length();
        initiated = true;
    }

    /**
     * Get the width of the tokens.
     *
     * @return the length shared by every bound string
     * @throws IllegalStateException if no symbol has been bound yet, since
     *         the width is only known after the first binding
     */
    public int getWidth() {
        if (!initiated) {
            throw new IllegalStateException("Tokenization not initialize yet");
        }
        return width;
    }

    /**
     * Bind a Symbol to a string.
     *
     * <p>The first string ever bound fixes the token width; every later
     * string must have that same length. If the symbol is already bound,
     * the existing binding is kept and this call changes nothing.</p>
     *
     * @param s the Symbol to bind
     * @param str the string to bind it to
     * @throws NullPointerException if <code>str</code> is null
     * @throws IllegalArgumentException if the length of <code>str</code>
     *         differs from the established token width
     */
    public void bindSymbol(Symbol s, String str) {
        // Fail fast with a descriptive message instead of a bare NPE raised
        // from str.length() further down.
        if (str == null) {
            throw new NullPointerException("Cannot bind a symbol to a null string");
        }
        if (!initiated) {
            init(str);
        }
        if (str.length() != width) {
            throw new IllegalArgumentException("This tokenization must have all its tokens with the same size");
        }
        // First binding wins: never overwrite an existing mapping.
        if (!symbolsToStrings.containsKey(s)) {
            symbolsToStrings.put(s, str);
        }
    }

    /**
     * Will throw an exception.
     *
     * @param token ignored
     * @return never returns normally
     * @throws UnsupportedOperationException always, because this
     *         tokenization is write-only
     */
    public Symbol parseToken(String token)
            throws IllegalSymbolException {
        throw new UnsupportedOperationException("AlternateTokenization are for writing only");
    }

    /**
     * Get the string bound to a symbol.
     *
     * <p>If the symbol has no binding of its own, it is validated against
     * the alphabet. For a finite alphabet, the string bound to that
     * alphabet's all-ambiguity symbol (if any) is then used instead.</p>
     *
     * @param s the symbol to write
     * @return the string standing for <code>s</code>
     * @throws IllegalSymbolException if the alphabet rejects the symbol, or
     *         if neither the symbol nor the all-ambiguity symbol is bound
     */
    public String tokenizeSymbol(Symbol s) throws IllegalSymbolException {
        String str = (String) symbolsToStrings.get(s);
        if (str == null) {
            // Validate and resolve the fallback against the same alphabet
            // reference, obtained the same way tokenizeSymbolList obtains it.
            Alphabet alpha = getAlphabet();
            alpha.validate(s);
            if (alpha instanceof FiniteAlphabet) {
                str = (String) symbolsToStrings.get(AlphabetManager.getAllAmbiguitySymbol((FiniteAlphabet) alpha));
            }
            if (str == null) {
                throw new IllegalSymbolException("No mapping for symbol " + s.getName());
            }
        }
        return str;
    }

    /**
     * Write a whole symbol list by concatenating the strings of its symbols
     * in iteration order.
     *
     * @param sl the symbol list to write; its alphabet must be the very same
     *        object as the alphabet of this tokenization
     * @return the concatenated tokens
     * @throws IllegalAlphabetException if the alphabets differ, or if one of
     *         the symbols cannot be tokenized
     */
    public String tokenizeSymbolList(SymbolList sl)
            throws IllegalAlphabetException {
        if (sl.getAlphabet() != getAlphabet()) {
            throw new IllegalAlphabetException("Alphabet " + sl.getAlphabet().getName() + " does not match " + getAlphabet().getName());
        }
        StringBuffer sb = new StringBuffer();
        for (Iterator i = sl.iterator(); i.hasNext(); ) {
            Symbol sym = (Symbol) i.next();
            try {
                sb.append(tokenizeSymbol(sym));
            } catch (IllegalSymbolException ex) {
                // Report per-symbol failures as a list-level error, keeping the cause.
                throw new IllegalAlphabetException(ex, "Couldn't tokenize");
            }
        }

        return sb.toString();
    }

    /**
     * Will throw an exception.
     *
     * @param listener ignored
     * @return never returns normally
     * @throws UnsupportedOperationException always, because this
     *         tokenization is write-only
     */
    public StreamParser parseStream(SeqIOListener listener) {
        throw new UnsupportedOperationException("AlternateTokenization are for writing only");
    }
}