/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.program.ssbind;

import java.util.HashMap;
import java.util.Map;

import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.alignment.Alignment;
import org.biojava.bio.alignment.SimpleAlignment;
import org.biojava.bio.search.SearchContentHandler;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.StrandedFeature;
import org.biojava.bio.seq.StrandedFeature.Strand;
import org.biojava.bio.seq.db.IllegalIDException;
import org.biojava.bio.seq.homol.SimilarityPairFeature;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.RangeLocation;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.utils.ChangeListener;
import org.biojava.utils.ChangeType;
import org.biojava.utils.ChangeVetoException;

/**
 * <p><code>SimilarityPairBuilder</code> annotates query and subject
 * <code>Sequence</code> with <code>SimilarityPairFeature</code>s
 * created from SAX events supplied via a
 * <code>SeqSimilarityAdapter</code>. The objective is to describe a
 * simple pairwise relationship between the two sequences. This
 * differs slightly from using <code>HomologyFeature</code>s which are
 * slightly heavier, have to contain a full alignment and don't have
 * an explicit distinction between query and subject sequences in the
 * alignment. The SAX events should describe elements conforming to
 * the BioJava BlastLikeDataSetCollection DTD. Suitable sources are
 * <code>BlastLikeSAXParser</code> or <code>FastaSAXParser</code>.</p>
 *
 * <p>Annotated <code>ViewSequence</code>s wrapping both query and
 * subject sequences are created.</p>
 *
 * <p><strong>The current implementation should be used with care on
 * streams containing more than one search output</strong>. This is
 * because the builder will not stop after each report has been
 * processed and as a result all the subject sequences get
 * instantiated and a large object network could be created during
 * processing.</p>
 *
 * @author Keith James
 * @author Greg Cox
 * @since 1.2
 */
public class SimilarityPairBuilder extends ViewSequenceFactory
    implements SearchContentHandler
{
    /**
     * Constant <code>SIMILARITY_PAIR_FEATURE_TYPE</code> the type
     * String used by <code>SimilarityPairBuilder</code> when creating
     * <code>SimilarityPairFeature</code>s. This is the String which
     * is returned when a <code>SimilarityPairFeature</code>'s
     * <code>getType()</code> method is called.
     */
    public static final String SIMILARITY_PAIR_FEATURE_TYPE = "similarity";

    // Value which marks a hit as lying on the reverse strand, both
    // as an explicit strand and as the prefix of a frame description
    private static final String MINUS = "minus";

    // Identifier of the query sequence currently being annotated
    private String queryID;

    // Data holders for search result properties. Each is cleared at
    // the start of the corresponding element (header, hit) and
    // filled by the add*Property callbacks
    private Map resultData;
    private Map hitData;
    private Map subHitData;

    // Tokenization used to parse the aligned sequence strings. It is
    // resolved lazily on the first sub-hit and then reused.
    // NOTE(review): it is never reset, so a stream which mixes
    // alphabets would keep using the first one - confirm this is
    // acceptable
    private SymbolTokenization tokenParser;
    private StringBuffer       tokenBuffer;

    // Flag indicating whether there are more results in the stream
    private boolean moreSearchesAvailable = false;

    /**
     * Creates a new <code>SimilarityPairBuilder</code> with empty
     * property holders and empty query and subject view caches.
     */
    public SimilarityPairBuilder()
    {
        resultData       = new HashMap();
        hitData          = new HashMap();
        subHitData       = new HashMap();
        queryViewCache   = new HashMap();
        subjectViewCache = new HashMap();
        tokenBuffer      = new StringBuffer(1024);
    }

    /**
     * <code>getAnnotatedQuerySeq</code> returns the annotated view
     * of a query sequence held in the query view cache.
     *
     * @param queryID a <code>String</code> identifier.
     *
     * @return a <code>Sequence</code> from the query view cache.
     *
     * @exception IllegalIDException if the cache holds no entry
     * for the identifier.
     */
    public Sequence getAnnotatedQuerySeq(String queryID)
        throws IllegalIDException
    {
        if (! queryViewCache.containsKey(queryID))
            throw new IllegalIDException("Failed to retrieve annotated query sequence from cache using ID '"
                                         + queryID
                                         + "' (unknown ID)");

        return (Sequence) queryViewCache.get(queryID);
    }

    /**
     * <code>getAnnotatedSubjectSeq</code> returns the annotated view
     * of a subject sequence held in the subject view cache. The
     * cache is cleared whenever <code>startSearch</code> is called.
     *
     * @param subjectID a <code>String</code> identifier.
     *
     * @return a <code>Sequence</code> from the subject view cache.
     *
     * @exception IllegalIDException if the cache holds no entry
     * for the identifier.
     */
    public Sequence getAnnotatedSubjectSeq(String subjectID)
        throws IllegalIDException
    {
        if (! subjectViewCache.containsKey(subjectID))
            throw new IllegalIDException("Failed to retrieve annotated subject sequence from cache using ID '"
                                         + subjectID
                                         + "' (unknown ID)");

        return (Sequence) subjectViewCache.get(subjectID);
    }

    /**
     * <code>setQueryID</code> records the identifier of the query
     * sequence to which subsequent sub-hits will be attached.
     *
     * @param queryID a <code>String</code> identifier.
     */
    public void setQueryID(String queryID)
    {
        this.queryID = queryID;
    }

    /**
     * <code>getMoreSearches</code> returns the value last passed to
     * <code>setMoreSearches</code> (initially false).
     *
     * @return a <code>boolean</code>.
     */
    public boolean getMoreSearches()
    {
        return moreSearchesAvailable;
    }

    /**
     * <code>setMoreSearches</code> sets the flag indicating whether
     * there are more results in the stream.
     *
     * @param value a <code>boolean</code>.
     */
    public void setMoreSearches(boolean value)
    {
        moreSearchesAvailable = value;
    }

    /**
     * <code>startSearch</code> discards all cached subject views.
     * The query view cache is left untouched.
     */
    public void startSearch()
    {
        subjectViewCache.clear();
    }

    public void endSearch() { }

    /**
     * <code>startHeader</code> discards the search properties
     * collected for the previous report.
     */
    public void startHeader()
    {
        resultData.clear();
    }

    public void endHeader() { }

    /**
     * <code>startHit</code> discards the hit and sub-hit properties
     * collected for the previous hit.
     */
    public void startHit()
    {
        hitData.clear();
        subHitData.clear();
    }

    public void endHit() { }

    public void startSubHit() { }

    /**
     * <code>endSubHit</code> builds the pair of features describing
     * the sub-hit from the properties collected so far. A
     * <code>BioException</code> raised while doing so is reported on
     * <code>System.err</code> and is not propagated, as this
     * callback does not declare any checked exception.
     */
    public void endSubHit()
    {
        try
        {
            makeSimilarity();
        }
        catch (BioException be)
        {
            System.err.println("Failed to build Similarity:");
            be.printStackTrace();
        }
    }

    /**
     * <code>addSearchProperty</code> stores a property of the
     * current search report.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addSearchProperty(Object key, Object value)
    {
        resultData.put(key, value);
    }

    /**
     * <code>addHitProperty</code> stores a property of the current
     * hit.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addHitProperty(Object key, Object value)
    {
        hitData.put(key, value);
    }

    /**
     * <code>addSubHitProperty</code> stores a property of the
     * current sub-hit.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addSubHitProperty(Object key, Object value)
    {
        subHitData.put(key, value);
    }

    /**
     * <code>makeSimilarity</code> creates one
     * <code>SimilarityPairFeature</code> on the query view and one
     * on the subject view, sharing a single pairwise
     * <code>Alignment</code>, and makes each the sibling of the
     * other. Both features are then locked against further change.
     *
     * @exception BioException if the sequence type or the subject
     * identifier cannot be determined, or if a callee raises one.
     */
    private void makeSimilarity() throws BioException
    {
        // Search and hit level properties are folded into the
        // sub-hit properties, so everything below (including the
        // feature annotation) sees a single merged view. Hit
        // properties win over search properties on a key clash
        subHitData.putAll(resultData);
        subHitData.putAll(hitData);

        // Try to get a valid TokenParser
        if (tokenParser == null)
        {
            String identifier;
            // Try explicit sequence type first
            if (subHitData.containsKey("hitSequenceType"))
                identifier = (String) subHitData.get("hitSequenceType");
            // Otherwise try to resolve from the program name (only
            // works for Blast)
            else if (subHitData.containsKey("program"))
                identifier = (String) subHitData.get("program");
            else
                throw new BioException("Failed to determine sequence type");

            FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier);
            tokenParser = alpha.getTokenization("token");
        }

        // Set strands of hit on query and subject. An explicit
        // strand is given by FASTA DNA and BLASTN; a frame, which
        // also carries strand information, by TBLASTN (hit only) and
        // TBLASTX (query and hit)
        Strand qStrand = resolveStrand("queryStrand",   "queryFrame");
        Strand sStrand = resolveStrand("subjectStrand", "subjectFrame");

        // Get start/end
        int qStart = Integer.parseInt((String) subHitData.get("querySequenceStart"));
        int   qEnd = Integer.parseInt((String) subHitData.get("querySequenceEnd"));
        int sStart = Integer.parseInt((String) subHitData.get("subjectSequenceStart"));
        int   sEnd = Integer.parseInt((String) subHitData.get("subjectSequenceEnd"));

        // The start/end coordinates from BioJava XML don't follow the
        // BioJava paradigm of start < end, with orientation given by
        // the strand property. Rather, they present start/end as
        // displayed in BLAST output, with the coordinates being
        // inverted on the reverse strand. We account for this here.
        if (qStrand == StrandedFeature.NEGATIVE)
        {
            int swap = qStart;
            qStart   = qEnd;
            qEnd     = swap;
        }

        if (sStrand == StrandedFeature.NEGATIVE)
        {
            int swap = sStart;
            sStart   = sEnd;
            sEnd     = swap;
        }

        // NOTE(review): "subjectId" is assumed to be the hit
        // property under which the adapter supplies the subject
        // identifier - confirm against the event source in use
        String subjectID = (String) hitData.get("subjectId");
        if (subjectID == null)
            throw new BioException("Failed to determine subject sequence ID");

        // Both views are obtained before any feature is created so
        // that a failure here leaves the query unannotated rather
        // than carrying a feature with no sibling.
        // makeSubjectViewSequence is inherited from
        // ViewSequenceFactory, as is makeQueryViewSequence
        Sequence   queryView = makeQueryViewSequence(queryID);
        Sequence subjectView = makeSubjectViewSequence(subjectID);

        // Map of Alignment sequences
        Map labelMap = new HashMap();

        try
        {
            // Set source to the program name
            String source = "unknown";
            if (subHitData.containsKey("program"))
                source = (String) subHitData.get("program");

            tokenBuffer.setLength(0);
            tokenBuffer.append((String) subHitData.get("querySequence"));
            labelMap.put(SimilarityPairFeature.QUERY_LABEL,
                         new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));

            tokenBuffer.setLength(0);
            tokenBuffer.append((String) subHitData.get("subjectSequence"));
            labelMap.put(SimilarityPairFeature.SUBJECT_LABEL,
                         new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));

            double score = 0.0;
            if (subHitData.containsKey("score"))
                score = Double.parseDouble((String) subHitData.get("score"));

            // Query sequence feature
            SimilarityPairFeature.Template qt =
                new SimilarityPairFeature.Template();
            qt.type       = SIMILARITY_PAIR_FEATURE_TYPE;
            qt.source     = source;
            qt.location   = new RangeLocation(qStart, qEnd);
            qt.strand     = qStrand;
            qt.score      = score;
            qt.annotation = AnnotationFactory.makeAnnotation(subHitData);

            // Subject sequence feature
            SimilarityPairFeature.Template st =
                new SimilarityPairFeature.Template();
            st.type       = SIMILARITY_PAIR_FEATURE_TYPE;
            st.source     = source;
            st.location   = new RangeLocation(sStart, sEnd);
            st.strand     = sStrand;
            st.score      = score;
            st.annotation = AnnotationFactory.makeAnnotation(subHitData);

            // The same Alignment instance is shared by both features
            Alignment a = new SimpleAlignment(labelMap);
            qt.alignment = a;
            st.alignment = a;

            SimilarityPairFeature qf =
                (SimilarityPairFeature) queryView.createFeature(qt);

            // The subject feature belongs on the subject view and is
            // built from the subject template; creating it from the
            // query template on the query view would give the query
            // two identical features and the subject none
            SimilarityPairFeature sf =
                (SimilarityPairFeature) subjectView.createFeature(st);

            sf.setSibling(qf);
            qf.setSibling(sf);

            // Siblings are now linked; veto any further change
            qf.addChangeListener(ChangeListener.ALWAYS_VETO,
                                 ChangeType.UNKNOWN);
            sf.addChangeListener(ChangeListener.ALWAYS_VETO,
                                 ChangeType.UNKNOWN);
        }
        catch (ChangeVetoException cve)
        {
            throw new BioError("Assertion failure creating "
                               + "SimilarityPairFeature. Template "
                               + "modification vetoed",cve);
        }
    }

    /**
     * <code>resolveStrand</code> determines the strand of one
     * partner of the current sub-hit from the merged sub-hit
     * properties.
     *
     * @param strandKey a <code>String</code> key of the explicit
     * strand property.
     * @param frameKey a <code>String</code> key of the frame
     * property.
     *
     * @return <code>StrandedFeature.NEGATIVE</code> if the strand
     * property equals "minus" or the frame property starts with
     * "minus", otherwise <code>StrandedFeature.POSITIVE</code>.
     */
    private Strand resolveStrand(String strandKey, String frameKey)
    {
        // Comparing from the constant side keeps a key mapped to
        // null from raising a NullPointerException
        if (MINUS.equals(subHitData.get(strandKey)))
            return StrandedFeature.NEGATIVE;

        String frame = (String) subHitData.get(frameKey);
        if (frame != null && frame.startsWith(MINUS))
            return StrandedFeature.NEGATIVE;

        return StrandedFeature.POSITIVE;
    }
}