/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.program.ssbind;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.biojava.bio.BioException;
import org.biojava.bio.alignment.Alignment;
import org.biojava.bio.alignment.SimpleAlignment;
import org.biojava.bio.search.SearchContentHandler;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.StrandedFeature;
import org.biojava.bio.seq.StrandedFeature.Strand;
import org.biojava.bio.seq.homol.Homology;
import org.biojava.bio.seq.homol.HomologyFeature;
import org.biojava.bio.seq.homol.SimpleHomology;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.RangeLocation;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.utils.ChangeVetoException;

/**
 * <p><code>BlastLikeHomologyBuilder</code> populates a
 * <code>List</code> with <code>Homology</code> instances created from
 * SAX events supplied via a <code>SeqSimilarityAdapter</code>. The
 * SAX events should describe elements conforming to the BioJava
 * BlastLikeDataSetCollection DTD. Suitable sources are
 * <code>BlastLikeSAXParser</code> or
 * <code>FastaSAXParser</code>. Annotated <code>ViewSequence</code>s
 * wrapping both query and subject sequences are created and populated
 * with <code>HomologyFeature</code>s. See the documentation of
 * <code>Homology</code> and <code>HomologyFeature</code>.</p>
 *
 * <p>As <code>SimpleHomologyFeature</code>s are created on views of
 * the query and subject sequences, both query and subject should be
 * nucleotide sequences (<code>SimpleHomologyFeature</code> extends
 * <code>StrandedFeature</code>). This limits the searches currently
 * handled to BLASTN, TBLASTX and Fasta DNA.</p>
 *
 * @author Keith James
 * @author Greg Cox
 * @since 1.2
 */
public class BlastLikeHomologyBuilder extends ViewSequenceFactory
    implements SearchContentHandler
{
    /**
     * <code>HOMOLOGY_FEATURE_TYPE</code> is the type String used by
     * <code>BlastLikeHomologyBuilder</code> when creating
     * <code>HomologyFeature</code>s. This is the String which is
     * returned when an <code>HomologyFeature</code>'s
     * <code>getType()</code> method is called.
     */
    public static final String HOMOLOGY_FEATURE_TYPE = "homology";

    // Identifier of the query sequence
    private String queryID;

    // Data holders for search result properties
    private Map resultData;
    private Map hitData;
    private Map subHitData;

    // Resolved lazily from the first sub-hit and then reused
    private SymbolTokenization tokenParser;

    // List for holding homologies from current search. There may be
    // more than one search result in a stream
    private List homologies;
    // Flag indicating whether there are more results in the stream
    private boolean moreSearchesAvailable = false;
    // List to accept homologies from all results in the stream
    private List target;

    /**
     * Creates a new <code>BlastLikeHomologyBuilder</code> which will
     * instantiate <code>Homology</code> objects into the
     * <code>List</code> target.
     *
     * @param target a <code>List</code>.
     */
    public BlastLikeHomologyBuilder(List target)
    {
        this.target = target;

        resultData       = new HashMap();
        hitData          = new HashMap();
        subHitData       = new HashMap();
        queryViewCache   = new HashMap();
        subjectViewCache = new HashMap();
    }

    /**
     * Sets the identifier used to obtain the query view sequence
     * when homologies are built.
     *
     * @param queryID a <code>String</code> identifier.
     */
    public void setQueryID(String queryID)
    {
        this.queryID = queryID;
    }

    /**
     * Returns the flag indicating whether more search results are
     * available in the stream.
     *
     * @return the value last passed to <code>setMoreSearches</code>
     * (initially false).
     */
    public boolean getMoreSearches()
    {
        return moreSearchesAvailable;
    }

    /**
     * Sets the flag indicating whether more search results are
     * available in the stream.
     *
     * @param value a <code>boolean</code>.
     */
    public void setMoreSearches(boolean value)
    {
        moreSearchesAvailable = value;
    }

    /**
     * Begins a new search result: the subject view cache is emptied
     * and a fresh list is started for this search's homologies.
     */
    public void startSearch()
    {
        subjectViewCache.clear();
        homologies = new ArrayList();
    }

    /**
     * Ends the current search result by appending its homologies to
     * the target list supplied at construction.
     */
    public void endSearch()
    {
        target.addAll(homologies);
    }

    /**
     * Begins a result header; previously collected search-level
     * properties are discarded.
     */
    public void startHeader()
    {
        resultData.clear();
    }

    public void endHeader() { }

    /**
     * Begins a hit; previously collected hit-level and sub-hit-level
     * properties are discarded.
     */
    public void startHit()
    {
        hitData.clear();
        subHitData.clear();
    }

    public void endHit() { }

    public void startSubHit() { }

    /**
     * Ends a sub-hit by building a <code>Homology</code> from the
     * properties collected so far. A sub-hit which cannot be
     * converted is reported on <code>System.err</code> and skipped;
     * the handler signature does not permit a checked exception to
     * be propagated.
     */
    public void endSubHit()
    {
        try
        {
            homologies.add(makeHomology());
        }
        catch (BioException be)
        {
            System.err.println("Failed to build Homology:");
            be.printStackTrace();
        }
    }

    /**
     * Records a search-level property.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addSearchProperty(Object key, Object value)
    {
        resultData.put(key, value);
    }

    /**
     * Records a hit-level property.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addHitProperty(Object key, Object value)
    {
        hitData.put(key, value);
    }

    /**
     * Records a sub-hit-level property.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addSubHitProperty(Object key, Object value)
    {
        subHitData.put(key, value);
    }

    /**
     * <code>makeHomology</code> creates a new
     * <code>SimpleHomology</code> describing the similarity between
     * the query and subject sequences. The
     * <code>HomologyFeatures</code> created are added to
     * <code>ViewSequence</code>s wrapping the query and subject
     * sequences.
     *
     * <p>All properties are validated and both alignment rows are
     * parsed before any feature is created, so that a malformed
     * sub-hit does not leave a feature behind on the query view.</p>
     *
     * @return an <code>Homology</code>.
     *
     * @exception BioException if a required property is missing or
     * malformed, or if the features or alignment cannot be created.
     */
    private Homology makeHomology() throws BioException
    {
        // Fold the search- and hit-level properties into the sub-hit
        // map so that the lookups below, and the feature annotation,
        // see everything in one place
        subHitData.putAll(resultData);
        subHitData.putAll(hitData);

        if (tokenParser == null)
        {
            tokenParser = resolveTokenization();
        }

        // Explicit strand (FASTA DNA, BLASTN) or a frame carrying
        // strand information (TBLASTN for hit, TBLASTX for both
        // query and hit)
        Strand qStrand = resolveStrand("queryStrand", "queryFrame");
        Strand sStrand = resolveStrand("subjectStrand", "subjectFrame");

        int qStart = parseCoordinate("querySequenceStart");
        int qEnd   = parseCoordinate("querySequenceEnd");
        int sStart = parseCoordinate("subjectSequenceStart");
        int sEnd   = parseCoordinate("subjectSequenceEnd");

        // The start/end coordinates from BioJava XML don't follow the
        // BioJava paradigm of start < end, with orientation given by
        // the strand property. Rather, they present start/end as
        // displayed in BLAST output, with the coordinates being
        // inverted on the reverse strand. We account for this here.
        if (qStrand == StrandedFeature.NEGATIVE)
        {
            int swap = qStart;
            qStart = qEnd;
            qEnd   = swap;
        }

        if (sStrand == StrandedFeature.NEGATIVE)
        {
            int swap = sStart;
            sStart = sEnd;
            sEnd   = swap;
        }

        // Parse both alignment rows up front; failure here must not
        // leave a half-built homology on the views
        SimpleSymbolList queryRow   = parseAlignmentRow("querySequence");
        SimpleSymbolList subjectRow = parseAlignmentRow("subjectSequence");

        String subjectID = (String) hitData.get("subjectId");

        Sequence queryView   = makeQueryViewSequence(queryID);
        Sequence subjectView = makeSubjectViewSequence(subjectID);

        SimpleHomology homology = new SimpleHomology();

        // Map of HomologyFeatures to Alignment sequences
        Map labelMap = new HashMap();

        try
        {
            String source = "unknown";
            if (subHitData.containsKey("program"))
            {
                source = (String) subHitData.get("program");
            }

            // Query sequence feature, mapped to its alignment row
            HomologyFeature.Template qt =
                makeTemplate(source, qStart, qEnd, qStrand, homology);
            labelMap.put(queryView.createFeature(qt), queryRow);

            // Subject sequence feature, mapped to its alignment row
            HomologyFeature.Template st =
                makeTemplate(source, sStart, sEnd, sStrand, homology);
            labelMap.put(subjectView.createFeature(st), subjectRow);

            Alignment a = new SimpleAlignment(labelMap);
            homology.setAlignment(a);

            return homology;
        }
        catch (ChangeVetoException cve)
        {
            throw new BioException("Failed to create HomologyFeature", cve);
        }
    }

    /**
     * Resolves the tokenization used to parse alignment rows, from
     * the explicit sequence type if present, otherwise from the
     * program name (only works for Blast).
     *
     * @return a <code>SymbolTokenization</code>.
     *
     * @exception BioException if neither property is available or
     * the alphabet cannot be resolved.
     */
    private SymbolTokenization resolveTokenization() throws BioException
    {
        String identifier;

        if (subHitData.containsKey("hitSequenceType"))
        {
            identifier = (String) subHitData.get("hitSequenceType");
        }
        else if (subHitData.containsKey("program"))
        {
            identifier = (String) subHitData.get("program");
        }
        else
        {
            throw new BioException("Failed to determine sequence type");
        }

        FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier);
        return alpha.getTokenization("token");
    }

    /**
     * Determines a strand from the collected properties. The strand
     * is negative if the strand property equals "minus" or the frame
     * property starts with "minus"; otherwise it is positive. Absent
     * or null properties are treated as giving no strand information.
     *
     * @param strandKey the key of the explicit strand property.
     * @param frameKey the key of the frame property.
     *
     * @return a <code>Strand</code>.
     */
    private Strand resolveStrand(String strandKey, String frameKey)
    {
        if ("minus".equals(subHitData.get(strandKey)))
        {
            return StrandedFeature.NEGATIVE;
        }

        String frame = (String) subHitData.get(frameKey);
        if (frame != null && frame.startsWith("minus"))
        {
            return StrandedFeature.NEGATIVE;
        }

        return StrandedFeature.POSITIVE;
    }

    /**
     * Reads an integer coordinate from the collected properties.
     * Problems are reported as <code>BioException</code> rather than
     * an unchecked <code>NumberFormatException</code>, so that the
     * caller's handling of unbuildable sub-hits applies.
     *
     * @param key the key of the coordinate property.
     *
     * @return the coordinate value.
     *
     * @exception BioException if the property is missing or is not
     * an integer.
     */
    private int parseCoordinate(String key) throws BioException
    {
        String value = (String) subHitData.get(key);

        if (value == null)
        {
            throw new BioException("Missing coordinate property '"
                                   + key + "'");
        }

        try
        {
            return Integer.parseInt(value);
        }
        catch (NumberFormatException nfe)
        {
            throw new BioException("Coordinate property '" + key
                                   + "' is not an integer: '"
                                   + value + "'", nfe);
        }
    }

    /**
     * Parses one row of the alignment from the collected properties
     * using the current tokenization.
     *
     * @param key the key of the alignment sequence property.
     *
     * @return a <code>SimpleSymbolList</code> for the row.
     *
     * @exception BioException if the property is missing or cannot
     * be tokenized.
     */
    private SimpleSymbolList parseAlignmentRow(String key)
        throws BioException
    {
        String row = (String) subHitData.get(key);

        if (row == null)
        {
            throw new BioException("Missing alignment property '"
                                   + key + "'");
        }

        return new SimpleSymbolList(tokenParser, row);
    }

    /**
     * Fills in a feature template for one side of the homology. Each
     * template receives its own annotation built from the merged
     * sub-hit properties.
     *
     * @param source the feature source String.
     * @param start the start of the feature location.
     * @param end the end of the feature location.
     * @param strand the feature strand.
     * @param homology the homology the feature belongs to.
     *
     * @return a populated <code>HomologyFeature.Template</code>.
     */
    private HomologyFeature.Template makeTemplate(String source,
                                                  int start,
                                                  int end,
                                                  Strand strand,
                                                  Homology homology)
    {
        HomologyFeature.Template template = new HomologyFeature.Template();
        template.type       = HOMOLOGY_FEATURE_TYPE;
        template.source     = source;
        template.location   = new RangeLocation(start, end);
        template.strand     = strand;
        template.annotation = AnnotationFactory.makeAnnotation(subHitData);
        template.homology   = homology;

        return template;
    }
}