001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import java.util.HashMap; 025import java.util.Iterator; 026import java.util.Map; 027 028import org.biojava.bio.BioError; 029import org.biojava.bio.BioException; 030import org.biojava.bio.symbol.AlphabetManager; 031import org.biojava.bio.symbol.FiniteAlphabet; 032import org.biojava.bio.symbol.IllegalSymbolException; 033import org.biojava.bio.symbol.Symbol; 034import org.biojava.bio.symbol.SymbolList; 035 036/** 037 * <code>SymbolListCharSequence</code> is a <code>CharSequence</code> 038 * implementation which wraps a <code>SymbolList</code>. It is present 039 * primarily to support regular expression matching over 040 * <code>SymbolList</code>s as it avoids creating a copy. 041 * 042 * @author Keith James 043 * @author Matthew Pocock 044 * @since 1.3 045 */ 046public class SymbolListCharSequence implements CharSequence 047{ 048 private SymbolList syms; 049 private Map alphaTokens; 050 051 /** 052 * Creates a new <code>SymbolListCharSequence</code> wrapping a 053 * <code>SymbolList</code>. 054 * 055 * @param syms a <code>SymbolList</code>. 056 */ 057 public SymbolListCharSequence(SymbolList syms) 058 { 059 FiniteAlphabet alphabet = (FiniteAlphabet) syms.getAlphabet(); 060 if (! (alphabet instanceof FiniteAlphabet)) 061 throw new IllegalArgumentException("Only SymbolLists using a FiniteAlphabet are supported by SymbolListCharSequence"); 062 063 SymbolTokenization sToke = getTokenizer(alphabet, "token"); 064 if (sToke == null) 065 sToke = getTokenizer(alphabet, "unicode"); 066 if (sToke == null) 067 throw new BioError("unable to get a character tokenization for alphabet " + alphabet.getName()); 068 069 this.syms = syms; 070 alphaTokens = new HashMap(Math.round(alphabet.size() / 0.75f) + 1); 071 072 073 try 074 { 075 for (Iterator si = AlphabetManager.getAllSymbols(alphabet).iterator(); si.hasNext();) 076 { 077 Symbol s = (Symbol) si.next(); 078 char symChar = sToke.tokenizeSymbol(s).charAt(0); 079 alphaTokens.put(s, new Character(symChar)); 080 } 081 } 082 catch (IllegalSymbolException ise) 083 { 084 throw new BioError("Internal error: failed to tokenize a Symbol from an existing SymbolList", ise); 085 } 086 } 087 088 private SymbolTokenization getTokenizer(FiniteAlphabet alphabet, String tokenType) 089 { 090 SymbolTokenization sToke = null; 091 try 092 { 093 sToke = alphabet.getTokenization(tokenType); 094 } 095 catch (BioException be) 096 { 097 return null; 098 } 099 100 if (sToke.getTokenType() != SymbolTokenization.CHARACTER) 101 return null; 102 else 103 return sToke; 104 } 105 106 private SymbolListCharSequence(SymbolList syms, Map alphaTokens) 107 { 108 this.syms = syms; 109 this.alphaTokens = alphaTokens; 110 } 111 112 public char charAt(int index) 113 { 114 return ((Character) alphaTokens.get(syms.symbolAt(index + 1))).charValue(); 115 } 116 117 public int length() 118 { 119 return syms.length(); 120 } 121 122 public CharSequence subSequence(int start, int end) 123 { 124 return new SymbolListCharSequence(syms.subList(start + 1, end), 125 alphaTokens); 126 } 127 128 public String toString() 129 { 130 return syms.seqString(); 131 } 132}