001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022 023package org.biojava.bio.seq.io; 024 025import java.util.ArrayList; 026import java.util.HashSet; 027import java.util.Iterator; 028import java.util.List; 029import java.util.Set; 030 031import org.biojava.bio.BioException; 032import org.biojava.bio.symbol.Alphabet; 033import org.biojava.bio.symbol.BasisSymbol; 034import org.biojava.bio.symbol.FiniteAlphabet; 035import org.biojava.bio.symbol.IllegalSymbolException; 036import org.biojava.bio.symbol.Symbol; 037 038/** 039 * Tokenization for cross-product alphabets. This class handles 040 * the general case of tokens of the form (foo bar baz), where 041 * each element is handled by a sub-tokenization. By default, 042 * these will be the "name" tokenizations of each of the sub-alphabets, 043 * but any tokenization can be used. 044 * 045 * @author Thomas Down 046 * @author Greg Cox 047 * @since 1.2 048 */ 049 050public class CrossProductTokenization extends WordTokenization { 051 private List subTokenizations; // List<SymbolTokenization> 052 053 public CrossProductTokenization(Alphabet alpha) 054 throws BioException 055 { 056 super(alpha); 057 subTokenizations = new ArrayList(); 058 for (Iterator i = alpha.getAlphabets().iterator(); i.hasNext(); ) { 059 Alphabet subAlpha = (Alphabet) i.next(); 060 subTokenizations.add(subAlpha.getTokenization("name")); 061 } 062 } 063 064 public CrossProductTokenization(Alphabet alpha, 065 List tokenizers) 066 { 067 super(alpha); 068 this.subTokenizations = tokenizers; 069 // Ought to validate... 070 } 071 072 public Symbol parseToken(String token) 073 throws IllegalSymbolException 074 { 075 char c = token.charAt(0); 076 if (c == '(') { 077 if (token.charAt(token.length() - 1) != ')') { 078 throw new IllegalSymbolException("Mismatched parentheses: " + token); 079 } else { 080 List split = splitString(token.substring(1, token.length() - 1)); 081 List syms = new ArrayList(); 082 083 Iterator si = split.iterator(); 084 Iterator ti = subTokenizations.iterator(); 085 while (si.hasNext()) { 086 String subToken = (String) si.next(); 087 SymbolTokenization subTokenization = (SymbolTokenization) ti.next(); 088 syms.add(subTokenization.parseToken(subToken)); 089 } 090 091 return getAlphabet().getSymbol(syms); 092 } 093 } else if (c == '[') { 094 if (token.charAt(token.length() - 1) != ']') { 095 throw new IllegalSymbolException("Mismatched parentheses: " + token); 096 } else { 097 Symbol[] syms = parseString(token.substring(1, token.length() - 1)); 098 Set ambigSet = new HashSet(); 099 for (int i = 0; i < syms.length; ++i) { 100 ambigSet.add(syms[i]); 101 } 102 return getAlphabet().getAmbiguity(ambigSet); 103 } 104 } else { 105 throw new IllegalSymbolException("Not in standard cross-product form: " + token); 106 } 107 } 108 109 public String tokenizeSymbol(Symbol s) throws IllegalSymbolException { 110 getAlphabet().validate(s); 111 112 if (s instanceof BasisSymbol) { 113 StringBuffer sb = new StringBuffer(); 114 sb.append('('); 115 Iterator si = ((BasisSymbol) s).getSymbols().iterator(); 116 Iterator ti = subTokenizations.iterator(); 117 118 while (si.hasNext()) { 119 Symbol subSym = (Symbol) si.next(); 120 SymbolTokenization subToke = (SymbolTokenization) ti.next(); 121 122 sb.append(subToke.tokenizeSymbol(subSym)); 123 if (si.hasNext()) { 124 sb.append(' '); 125 } 126 } 127 sb.append(')'); 128 return sb.substring(0); 129 } else { 130 StringBuffer sb = new StringBuffer(); 131 sb.append('['); 132 Iterator si = ((FiniteAlphabet) s.getMatches()).iterator(); 133 while (si.hasNext()) { 134 Symbol aSym = (Symbol) si.next(); 135 sb.append(tokenizeSymbol(aSym)); 136 if (si.hasNext()) { 137 sb.append(' '); 138 } 139 } 140 sb.append(']'); 141 return sb.substring(0); 142 } 143 } 144}