001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022 023package org.biojava.bio.seq.io; 024 025import java.io.Serializable; 026import java.util.ArrayList; 027import java.util.Iterator; 028import java.util.List; 029 030import org.biojava.bio.Annotation; 031import org.biojava.bio.symbol.Alphabet; 032import org.biojava.bio.symbol.IllegalAlphabetException; 033import org.biojava.bio.symbol.IllegalSymbolException; 034import org.biojava.bio.symbol.Symbol; 035import org.biojava.bio.symbol.SymbolList; 036import org.biojava.utils.Unchangeable; 037 038/** 039 * Base class for tokenizations which accept whitespace-separated 040 * `words'. Splits at whitespace, except when it is quoted by 041 * either double-quotes ("), brackets (), or square brackets []. 042 * 043 * @author Thomas Down 044 * @author Greg Cox 045 * @author Keith James 046 * @since 1.2 047 */ 048 049public abstract class WordTokenization 050 extends 051 Unchangeable 052 implements 053 SymbolTokenization, Serializable 054{ 055 private Alphabet alphabet; 056 057 public WordTokenization(Alphabet fab) { 058 this.alphabet = fab; 059 } 060 061 public Alphabet getAlphabet() { 062 return alphabet; 063 } 064 065 public TokenType getTokenType() { 066 return SEPARATED; 067 } 068 069 public Annotation getAnnotation() { 070 return Annotation.EMPTY_ANNOTATION; 071 } 072 073 public String tokenizeSymbolList(SymbolList sl) 074 throws IllegalSymbolException, IllegalAlphabetException 075 { 076 if (sl.getAlphabet() != getAlphabet()) { 077 throw new IllegalAlphabetException("Alphabet " + sl.getAlphabet().getName() + " does not match " + getAlphabet().getName()); 078 } 079 StringBuffer sb = new StringBuffer(); 080 Iterator i = sl.iterator(); 081 while (i.hasNext()) { 082 Symbol sym = (Symbol) i.next(); 083 sb.append(tokenizeSymbol(sym)); 084 if (i.hasNext()) { 085 sb.append(' '); 086 } 087 } 088 return sb.substring(0); 089 } 090 091 public StreamParser parseStream(SeqIOListener siol) { 092 return new WordStreamParser(siol); 093 } 094 095 protected List splitString(String str) 096 throws IllegalSymbolException 097 { 098 int ptr = 0; 099 List sl = new ArrayList(); 100 101 while (ptr < str.length()) { 102 char c = str.charAt(ptr); 103 if (Character.isWhitespace(c)) { 104 ++ptr; 105 } else if (c == '(') { 106 int nextPtr = findMatch(str, ptr, '(', ')'); 107 sl.add(str.substring(ptr, nextPtr)); 108 ptr = nextPtr; 109 } else if (c == '[') { 110 int nextPtr = findMatch(str, ptr, '[', ']'); 111 sl.add(str.substring(ptr, nextPtr)); 112 ptr = nextPtr; 113 } else { 114 int nextPtr = ptr; 115 char nc; 116 boolean quoted = false; 117 do { 118 nextPtr++; 119 if (nextPtr == str.length()) { 120 nc = ' '; 121 } else { 122 nc = str.charAt(nextPtr); 123 } 124 if (nc == '"') { 125 quoted = !quoted; 126 } 127 } while (!Character.isWhitespace(nc)); 128 129 sl.add(str.substring(ptr, nextPtr)); 130 ptr = nextPtr; 131 } 132 } 133 134 return sl; 135 } 136 137 protected Symbol[] parseString(String s) 138 throws IllegalSymbolException 139 { 140 List split = splitString(s); 141 Symbol[] syms = new Symbol[split.size()]; 142 for (int i = 0; i < split.size(); ++i) { 143 syms[i] = parseToken((String) split.get(i)); 144 } 145 return syms; 146 } 147 148 private class WordStreamParser implements StreamParser { 149 SeqIOListener listener; 150 StringBuffer sb = new StringBuffer(); 151 152 WordStreamParser(SeqIOListener l) { 153 listener = l; 154 } 155 156 public void characters(char[] data, int start, int len) { 157 sb.append(data, start, len); 158 } 159 160 public void close() 161 throws IllegalSymbolException 162 { 163 String str = sb.substring(0); 164 Symbol[] syms = parseString(str); 165 try { 166 listener.addSymbols(alphabet, syms, 0, syms.length); 167 } catch (IllegalAlphabetException ex) { 168 throw new IllegalSymbolException("Mismatched alphabets"); 169 } 170 } 171 } 172 173 private int findMatch(String str, int ptr, char openChar, char closeChar) { 174 int level = 0; 175 do { 176 char c = str.charAt(ptr++); 177 if (c == openChar) { 178 ++level; 179 } else if (c == closeChar) { 180 --level; 181 } 182 } while (level > 0); 183 return ptr; 184 } 185}