001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.hmmer; 023 024import java.io.BufferedReader; 025import java.io.File; 026import java.io.FileReader; 027import java.util.Iterator; 028import java.util.StringTokenizer; 029 030import org.biojava.bio.dist.Distribution; 031import org.biojava.bio.dist.DistributionFactory; 032import org.biojava.bio.dp.DotState; 033import org.biojava.bio.dp.EmissionState; 034import org.biojava.bio.dp.State; 035import org.biojava.bio.seq.ProteinTools; 036import org.biojava.bio.seq.io.SymbolTokenization; 037import org.biojava.bio.symbol.Alphabet; 038import org.biojava.bio.symbol.FiniteAlphabet; 039import org.biojava.bio.symbol.Symbol; 040 041 042 043/** A class for parsing in Hmmer markov models from HMM_ls files generated by HMMER training 044 * note that this class is still currently experimental. 045 * @author Lachlan Coin 046 */ 047public class HmmerProfileParser{ 048 049 protected Alphabet alph = ProteinTools.getAlphabet(); 050 051 /** Returns a profile HMM representing the core HMMER hmm 052 * @param inputfile the file which contains the Profile HMM data, as output by HMMER - e.g. HMM_ls 053 */ 054 public static HmmerProfileHMM parse(File inputfile){ 055 HmmerProfileParser hmmP = new HmmerProfileParser(inputfile.toString()); 056 hmmP.parseModel(inputfile); 057 hmmP.setProfileHMM(); 058 return hmmP.getModel(); 059 } 060 061 /** Returns the full markov model - including the core model + J,C,N loop states. 062 * @param inputfile the file which contains the Profile HMM data, as output by HMMER - e.g. HMM_ls 063 */ 064 public static FullHmmerProfileHMM parseFull(File inputfile){ 065 HmmerProfileParser hmmP = new HmmerProfileParser(inputfile.toString()); 066 hmmP.parseModel(inputfile); 067 hmmP.setProfileHMM(); 068 hmmP.initialiseFullProfileHMM(); 069 hmmP.setFullProfileHMM(); 070 return hmmP.getFullModel(); 071 } 072 073 074 protected String domain1; 075 076 077 protected HmmerProfileParser(String domain){ 078 this.domain1 = domain; 079 } 080 081 protected HmmerProfileHMM initialiseProfileHMM(int len){ 082 try{ 083 DistributionFactory matchFactory = DistributionFactory.DEFAULT; 084 DistributionFactory insertFactory = DistributionFactory.DEFAULT; 085 return new HmmerProfileHMM(alph, len, matchFactory, insertFactory, domain1); 086 } 087 catch(Throwable t){ 088 t.printStackTrace(); 089 return null; 090 } 091 } 092 093 protected HmmerModel hmm; 094 private static final double sumCheckThreshold = 0.001; 095 096 public HmmerProfileHMM getModel(){ 097 return hmm.hmm; 098 } 099 100 101 FullHmmerProfileHMM getFullModel(){ 102 return hmm.hmm_full; 103 } 104 105 void initialiseFullProfileHMM(){ 106 hmm.initialiseFullProfileHMM(); 107 } 108 109 110 public void setProfileHMM(){ 111 hmm.setProfileHMM(); 112 } 113 114 void setFullProfileHMM(){ 115 hmm.setFullProfileHMM(); 116 } 117 118 119 public void parseModel(File inputFile){ 120 System.out.println("Parsing model "+inputFile); 121 try{ 122 BufferedReader in = new BufferedReader(new FileReader(inputFile)); 123 boolean inModel=false; 124 int seq_pos=1; 125 int rel_pos=0; 126 String s = new String(); 127 while((s = in.readLine())!= null){ 128 if(s.startsWith("//")) break; 129 if(!inModel){ 130 if(s.startsWith("LENG")) { 131 int[] a = parseString(s.substring(5),1); 132 hmm = new HmmerModel(a[0]); 133 } 134 else if(s.startsWith("NULE")) 135 hmm.setNullEmissions(s.substring(5)); 136 else if(s.startsWith("NULT")) 137 hmm.setNullTransitions(s.substring(5)); 138 else if(s.startsWith("XT")) 139 hmm.setSpecialTransitions(s.substring(5)); 140 else if(s.startsWith("HMM ")){ 141 inModel=true; 142 hmm.setAlphList(s.substring(7)); 143 in.readLine(); 144 hmm.setBeginTransition(in.readLine()); 145 } 146 } 147 else{ 148 if(rel_pos==0){ 149 hmm.setEmissions(s.substring(7), seq_pos); 150 } 151 else if(rel_pos==1 && seq_pos==1){ 152 hmm.setInsertEmissions(s.substring(7)); 153 } 154 else if(rel_pos==2){ 155 hmm.setTransitions(s.substring(7), seq_pos); 156 } 157 rel_pos++; 158 if(rel_pos==3){ 159 rel_pos=0; 160 seq_pos++; 161 } 162 } 163 } 164 in.close(); 165 } 166 catch(Throwable t){ 167 t.printStackTrace(); 168 } 169 } 170 171 172 173 174 175 176 static int[] parseString(String s, int len){ 177 String[] s1 = parseStringA(s,len); 178 int[] s2 = new int[len]; 179 for(int i=0; i<s1.length; i++){ 180 if(s1[i].indexOf("*")!= -1) s2[i] = Integer.MIN_VALUE; 181 else s2[i] = Integer.parseInt(s1[i]); 182 } 183 return s2; 184 } 185 186 static String[] parseStringA(String s, int len){ 187 String[] s2 = new String[len]; 188 StringTokenizer st = new StringTokenizer(s); 189 int i=0; 190 while(st.hasMoreTokens() && i<len){ 191 s2[i] = st.nextToken(); 192 i++; 193 } 194 return s2; 195 } 196 197 198/** An intermediate class to store the parsed data */ 199 class HmmerModel{ 200 /** Maps the null emission probabilities to symbols */ 201 int[] nullEmissions; 202 /** Maps the null transition probabilities */ 203 int[] nullTransitions; 204 205 /**A map of emission probabilities along the sequence. */ 206 int[][] emissions; 207 208 int[] insertEmissions; 209 210 211 int[][] transitions; 212 213 int[] beginTransition; 214 215 int[] specialTransitions; 216 217 Symbol[] alphList; 218; 219 HmmerProfileHMM hmm; 220 FullHmmerProfileHMM hmm_full; 221 222 HmmerModel(int length){ 223 System.out.println("Constructing base model"); 224 nullEmissions = new int[20]; 225 nullTransitions = new int[2]; 226 emissions = new int[length][20]; 227 insertEmissions = new int[20]; 228 specialTransitions = new int[8]; 229 transitions = new int[length+1][9]; 230 alphList = new Symbol[21]; 231 hmm = initialiseProfileHMM(emissions.length); 232 233 } 234 235 void setAlphList(String s){ 236 try{ 237 String[] list = parseStringA(s,20); 238 SymbolTokenization tokenizer = alph.getTokenization("token"); 239 for(int i=0; i<list.length;i++){ 240 alphList[i] = tokenizer.parseToken(list[i]); 241 } 242 alphList[list.length] = tokenizer.parseToken("U"); 243 } 244 catch(Throwable t){ 245 t.printStackTrace(); 246 } 247 } 248 249 void setEmissions(String s, int pos){ 250 emissions[pos-1] = parseString(s,20); 251 } 252 253 void setNullEmissions(String s){ 254 nullEmissions = parseString(s,20); 255 256 } 257 258 void setInsertEmissions(String s){ 259 insertEmissions = parseString(s,20); 260 } 261 262 void setNullTransitions(String s){ 263 nullTransitions = parseString(s,2); 264 } 265 266 void setTransitions(String s, int pos){ 267 transitions[pos] = parseString(s,9); 268 } 269 270 void setBeginTransition(String s){ 271 transitions[0]= parseString(s,3); 272 } 273 274 void setSpecialTransitions(String s){ 275 specialTransitions = parseString(s,8); 276 } 277 278 279 280 281 282 void initialiseFullProfileHMM(){ 283 try{ 284 hmm_full = new FullHmmerProfileHMM(hmm); 285 } 286 catch(Throwable t){ 287 t.printStackTrace(); 288 } 289 } 290 291 292 private void validateDistributionSum(Distribution dist) throws Exception{ 293 Iterator iter = ((FiniteAlphabet)dist.getAlphabet()).iterator(); 294 double sum=0.0; 295 while(iter.hasNext()){ 296 Symbol to = (Symbol) iter.next(); 297 sum += dist.getWeight(to); 298 //System.out.println(to.getName()+" "+dist.getWeight(to)); 299 } 300 //System.out.println("//"); 301 validateSum(sum); 302 } 303 304 private void addProbability(Distribution dist, State state, double prob) throws Exception{ 305 if(Double.isNaN(dist.getWeight(state))) 306 dist.setWeight(state,0); 307 Iterator iter = ((FiniteAlphabet)dist.getAlphabet()).iterator(); 308 while(iter.hasNext()){ 309 Symbol to = (Symbol) iter.next(); 310 double currentP = dist.getWeight(to); 311 dist.setWeight(to,currentP*(1-prob)); 312 } 313 dist.setWeight(state, dist.getWeight(state)+prob); 314 validateDistributionSum(dist); 315 } 316 317 private void checkTransitionSum() throws Exception{ 318 for (int i=0; i<=hmm.columns();i++){ 319 validateDistributionSum(hmm.getWeights(hmm.getMatch(i))); 320 if(i>0 && i<hmm.columns()){ 321 validateDistributionSum(hmm.getWeights(hmm.getInsert(i))); 322 validateDistributionSum(hmm.getWeights(hmm.getDelete(i))); 323 } 324 } 325 } 326 327 private void validateSum(double sum) throws Exception{ 328 if(Math.abs(sum-1.0)>sumCheckThreshold) 329 throw new Exception("Distribution does not sum to 1. Sums to "+sum); 330 } 331 332 /** Modifies HMM search for partial hits, by dividing probability by 2 at 333 * each point and adding transition to end state 334 */ 335 void addProfileHMMTransitions() throws Exception{ 336 for(int i=1; i<=hmm.columns(); i++){ 337 if(i>1){ 338 hmm.createTransition(hmm.magicalState(), hmm.getMatch(i)); 339 } 340 if(i<hmm.columns()){ 341 hmm.createTransition(hmm.getMatch(i), hmm.magicalState()); 342 } 343 } 344 } 345 346 347 /** Modifies HMM search for partial hits, by dividing probability by 2 at 348 * each point and adding transition to end state 349 */ 350 void modifyProbabilities() throws Exception{ 351 for(int i=1; i<=hmm.columns(); i++){ 352 if(i>1){ 353 addProbability(hmm.getWeights(hmm.magicalState()), hmm.getMatch(i), 0.5); 354 } 355 if(i<hmm.columns()){ 356 addProbability(hmm.getWeights(hmm.getMatch(i)), hmm.magicalState(), 0.5); 357 } 358 } 359 } 360 361 362 void setBeginEnd() throws Exception{ 363 Distribution dist = hmm.getWeights(hmm.magicalState()); 364 for(int i=1; i<=hmm.columns(); i++){ 365 EmissionState match = hmm.getMatch(i); 366 Distribution match_dist = hmm.getWeights(match); 367 dist.setWeight(match, convertToProb(transitions[i][7])); 368 match_dist.setWeight(hmm.magicalState(),convertToProb(transitions[i][8])); 369 } 370 } 371 372 void setFullProfileHMM(){ 373 try{ 374 Distribution dist = hmm_full.getWeights(hmm_full.magicalState()); 375 dist.setWeight(hmm_full.nState(), 1.0); 376 377 dist = hmm_full.getWeights(hmm_full.nState()); 378 dist.setWeight(hmm_full.hmm(), convertToProb(specialTransitions[0])); 379 dist.setWeight(hmm_full.nState(), convertToProb(specialTransitions[1])); 380 381 dist = hmm_full.getWeights(hmm_full.hmm()); 382 dist.setWeight(hmm_full.cState(), convertToProb(specialTransitions[2])); 383 dist.setWeight(hmm_full.jState(), convertToProb(specialTransitions[3])); 384 385 dist = hmm_full.getWeights(hmm_full.cState()); 386 dist.setWeight(hmm_full.magicalState(), convertToProb(specialTransitions[4])); 387 dist.setWeight(hmm_full.cState(), convertToProb(specialTransitions[5])); 388 389 dist = hmm_full.getWeights(hmm_full.jState()); 390 dist.setWeight(hmm_full.hmm(), convertToProb(specialTransitions[6])); 391 dist.setWeight(hmm_full.jState(), convertToProb(specialTransitions[7])); 392 } 393 catch(Throwable t){ 394 t.printStackTrace(); 395 } 396 } 397 398 399 void setProfileHMM(){ 400 try{ 401 for(int i=0; i<=hmm.columns(); i++){ 402 EmissionState match = hmm.getMatch(i); 403 Distribution dist = hmm.getWeights(match); 404 if(i<hmm.columns()){ 405 dist.setWeight(hmm.getMatch(i+1), 406 convertToProb(transitions[i][0])); 407 if(i>=1){ 408 dist.setWeight(hmm.getInsert(i), 409 convertToProb(transitions[i][1])); 410 } 411 dist.setWeight(hmm.getDelete(i+1), 412 convertToProb(transitions[i][2])); 413 } 414 else{ 415 dist.setWeight(hmm.magicalState(),1.0); 416 } 417 } 418 for(int i=1; i<hmm.columns(); i++){ 419 EmissionState insert = hmm.getInsert(i); 420 Distribution dist = hmm.getWeights(insert); 421 dist.setWeight(hmm.getMatch(i+1), 422 convertToProb(transitions[i][3])); 423 dist.setWeight(insert, 424 convertToProb(transitions[i][4])); 425 } 426 for(int i=1; i<hmm.columns(); i++){ 427 DotState delete = hmm.getDelete(i); 428 Distribution dist = hmm.getWeights(delete); 429 dist.setWeight(hmm.getMatch(i+1), 430 convertToProb(transitions[i][5])); 431 dist.setWeight(hmm.getDelete(i+1), 432 convertToProb(transitions[i][6])); 433 } 434 setBeginEnd(); 435 checkTransitionSum(); 436 // setting emission probabilities 437 438 Distribution insertEmission = hmm.getInsert(1).getDistribution(); 439 Distribution nullModel = DistributionFactory.DEFAULT.createDistribution(alph); 440 for (int j=0; j<alphList.length; j++){ 441 double prob; 442 double null_prob; 443 if(j<alphList.length-1){ 444 null_prob = convertToProb(nullEmissions[j],0.05); 445 prob = convertToProb(insertEmissions[j], null_prob); 446 447 } 448 else{ 449 prob = 0.0; 450 null_prob = 0.0; 451 } 452 insertEmission.setWeight(alphList[j],prob); 453 nullModel.setWeight(alphList[j],null_prob); 454 } 455 insertEmission.setNullModel(nullModel); 456 validateDistributionSum(insertEmission); 457 //System.out.println("NULL MODEL!!!!!"); 458 validateDistributionSum(nullModel); 459 460 for(int i=1; i<=hmm.columns(); i++){ 461 Distribution matchEmission = hmm.getMatch(i).getDistribution(); 462 if(i>1 && i<hmm.columns()) 463 hmm.getInsert(i).setDistribution(insertEmission); 464 for (int j=0; j<alphList.length; j++){ 465 double prob; 466 if(j<alphList.length-1){ 467 double null_prob = convertToProb(nullEmissions[j],0.05); 468 prob = convertToProb(emissions[i-1][j],null_prob); 469 } 470 else prob = 0.0; 471 matchEmission.setWeight(alphList[j],prob); 472 } 473 validateDistributionSum(matchEmission); 474 matchEmission.setNullModel(nullModel); 475 //System.out.println("NULL MODEL!!!!!"); 476 validateDistributionSum(matchEmission.getNullModel()); 477 } 478 //modifyProbabilities(); 479 } 480 catch(Throwable t){ 481 t.printStackTrace(); 482 } 483 } 484 485 private double convertToProb(int score){ 486 double result=0.0; 487 if(score!=Integer.MIN_VALUE){ 488 result = 1*Math.pow(2.0,((double) score/1000)); 489 } 490 return result; 491 } 492 493 private double convertToProb(int score, double nullprob){ 494 double result=0.0; 495 if(score!=Integer.MIN_VALUE){ 496 //double background = 0.05*Math.pow(2.0,((double) nullscore/1000)); 497 //result = background; 498 result = nullprob*Math.pow(2.0,((double) score/1000)); 499 //System.out.println(score+ " "+nullscore+" "+result); 500 } 501 return result; 502 } 503 504 505 } 506} 507 508 509 510 511