001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.hmmer;
023
024import java.io.BufferedReader;
025import java.io.File;
026import java.io.FileReader;
027import java.util.Iterator;
028import java.util.StringTokenizer;
029
030import org.biojava.bio.dist.Distribution;
031import org.biojava.bio.dist.DistributionFactory;
032import org.biojava.bio.dp.DotState;
033import org.biojava.bio.dp.EmissionState;
034import org.biojava.bio.dp.State;
035import org.biojava.bio.seq.ProteinTools;
036import org.biojava.bio.seq.io.SymbolTokenization;
037import org.biojava.bio.symbol.Alphabet;
038import org.biojava.bio.symbol.FiniteAlphabet;
039import org.biojava.bio.symbol.Symbol;
040
041
042
043/** A class for parsing in Hmmer markov models from HMM_ls files generated by HMMER training
044 * note that this class is still currently experimental. 
045 * @author Lachlan Coin
046 */
047public class HmmerProfileParser{
048
049                protected Alphabet alph = ProteinTools.getAlphabet();
050        
051    /** Returns a profile HMM representing the core HMMER hmm
052      * @param inputfile the file which contains the Profile HMM data, as output by HMMER - e.g. HMM_ls
053     */
054    public static HmmerProfileHMM parse(File inputfile){
055        HmmerProfileParser hmmP = new HmmerProfileParser(inputfile.toString());
056        hmmP.parseModel(inputfile);
057        hmmP.setProfileHMM();
058        return hmmP.getModel();
059    }
060
061    /** Returns the full markov model - including the core model + J,C,N loop states.
062     *  @param inputfile the file which contains the Profile HMM data, as output by HMMER - e.g. HMM_ls
063     */    
064    public static FullHmmerProfileHMM parseFull(File inputfile){
065        HmmerProfileParser hmmP = new HmmerProfileParser(inputfile.toString());
066        hmmP.parseModel(inputfile);
067        hmmP.setProfileHMM();
068        hmmP.initialiseFullProfileHMM();
069        hmmP.setFullProfileHMM();
070        return hmmP.getFullModel();
071    }
072
073   
074    protected String domain1;
075
076
077    protected HmmerProfileParser(String domain){
078        this.domain1 = domain;
079    }
080    
081    protected HmmerProfileHMM initialiseProfileHMM(int len){
082            try{
083            DistributionFactory matchFactory = DistributionFactory.DEFAULT;
084            DistributionFactory insertFactory = DistributionFactory.DEFAULT;
085            return new HmmerProfileHMM(alph, len, matchFactory, insertFactory, domain1);
086            }
087            catch(Throwable t){
088                t.printStackTrace();
089                return null;
090            }
091        }
092
093    protected HmmerModel hmm;
094    private static final double sumCheckThreshold = 0.001;
095
096    public HmmerProfileHMM getModel(){
097        return hmm.hmm;
098    }
099
100    
101    FullHmmerProfileHMM getFullModel(){
102        return hmm.hmm_full;
103    }
104    
105    void initialiseFullProfileHMM(){
106        hmm.initialiseFullProfileHMM();
107    }
108   
109
110     public void setProfileHMM(){
111        hmm.setProfileHMM();
112    }
113    
114     void  setFullProfileHMM(){
115        hmm.setFullProfileHMM();
116    }
117    
118
119     public void parseModel(File inputFile){
120        System.out.println("Parsing model "+inputFile);
121        try{
122            BufferedReader in = new BufferedReader(new FileReader(inputFile));
123            boolean inModel=false;
124            int seq_pos=1;
125            int rel_pos=0;
126            String s = new String();
127            while((s = in.readLine())!= null){
128                if(s.startsWith("//")) break;
129                if(!inModel){           
130                    if(s.startsWith("LENG")) {
131                        int[] a = parseString(s.substring(5),1);
132                        hmm = new HmmerModel(a[0]);
133                    }
134                    else if(s.startsWith("NULE"))
135                        hmm.setNullEmissions(s.substring(5));
136                    else if(s.startsWith("NULT"))
137                        hmm.setNullTransitions(s.substring(5));
138                    else if(s.startsWith("XT"))
139                        hmm.setSpecialTransitions(s.substring(5));
140                    else if(s.startsWith("HMM ")){
141                        inModel=true;
142                        hmm.setAlphList(s.substring(7));
143                        in.readLine();
144                        hmm.setBeginTransition(in.readLine());
145                    }
146                }
147                else{
148                    if(rel_pos==0){
149                        hmm.setEmissions(s.substring(7), seq_pos);
150                    }
151                    else if(rel_pos==1 && seq_pos==1){
152                        hmm.setInsertEmissions(s.substring(7));
153                    }
154                    else if(rel_pos==2){
155                        hmm.setTransitions(s.substring(7), seq_pos);
156                    }
157                    rel_pos++;
158                    if(rel_pos==3){
159                        rel_pos=0;
160                        seq_pos++;
161                    }
162                }                  
163            }
164            in.close();
165        }
166        catch(Throwable t){
167          t.printStackTrace();
168        }
169    }
170
171
172    
173
174
175
176    static int[] parseString(String s, int len){
177        String[] s1 = parseStringA(s,len);
178        int[] s2 = new int[len];
179        for(int i=0; i<s1.length; i++){
180            if(s1[i].indexOf("*")!= -1) s2[i] = Integer.MIN_VALUE;
181            else s2[i] = Integer.parseInt(s1[i]);
182        }
183        return s2;
184    }
185
186    static String[] parseStringA(String s, int len){
187        String[] s2 = new String[len];
188        StringTokenizer st = new StringTokenizer(s);
189        int i=0;
190        while(st.hasMoreTokens() && i<len){
191            s2[i] = st.nextToken(); 
192            i++;
193        }
194        return s2;
195    }
196
197
198/** An intermediate class to store the parsed data */
199    class HmmerModel{
200        /** Maps the null emission probabilities to symbols */
201        int[] nullEmissions;
202        /** Maps the null transition probabilities */
203        int[] nullTransitions;
204
205        /**A map of emission probabilities along the sequence. */
206        int[][] emissions;
207        
208        int[] insertEmissions;
209
210
211        int[][] transitions;
212        
213        int[] beginTransition;
214
215        int[] specialTransitions;
216
217        Symbol[] alphList;
218;
219        HmmerProfileHMM hmm;
220        FullHmmerProfileHMM hmm_full;
221
222        HmmerModel(int length){
223            System.out.println("Constructing base model");
224            nullEmissions = new int[20];
225            nullTransitions = new int[2];
226            emissions = new int[length][20];
227            insertEmissions = new int[20];
228            specialTransitions = new int[8];
229            transitions = new int[length+1][9];
230            alphList = new Symbol[21];
231            hmm = initialiseProfileHMM(emissions.length);
232
233        }
234
235        void setAlphList(String s){
236            try{
237                String[] list = parseStringA(s,20);
238                SymbolTokenization tokenizer = alph.getTokenization("token");
239                for(int i=0; i<list.length;i++){
240                    alphList[i] = tokenizer.parseToken(list[i]);
241                }
242                alphList[list.length] = tokenizer.parseToken("U");
243            }
244            catch(Throwable t){
245                t.printStackTrace();
246            }
247        }
248
249        void setEmissions(String s, int pos){
250            emissions[pos-1] = parseString(s,20);
251        }
252
253        void setNullEmissions(String s){
254            nullEmissions = parseString(s,20);
255
256        }
257
258        void setInsertEmissions(String s){
259            insertEmissions = parseString(s,20);
260        }
261
262        void setNullTransitions(String s){
263            nullTransitions = parseString(s,2);
264        }
265
266        void setTransitions(String s, int pos){
267           transitions[pos] =  parseString(s,9);
268        }
269        
270        void setBeginTransition(String s){
271            transitions[0]= parseString(s,3);
272        }
273
274        void setSpecialTransitions(String s){
275            specialTransitions = parseString(s,8);
276        }
277
278
279
280        
281        
282        void initialiseFullProfileHMM(){
283            try{
284                hmm_full = new FullHmmerProfileHMM(hmm);
285            }
286            catch(Throwable t){
287                t.printStackTrace();
288            }
289        }
290       
291
292        private void  validateDistributionSum(Distribution dist) throws Exception{
293            Iterator iter = ((FiniteAlphabet)dist.getAlphabet()).iterator();
294            double sum=0.0;
295            while(iter.hasNext()){
296                Symbol to = (Symbol) iter.next();
297                sum += dist.getWeight(to);
298                //System.out.println(to.getName()+" "+dist.getWeight(to));
299            }
300            //System.out.println("//");
301            validateSum(sum);
302        }
303
304        private void  addProbability(Distribution dist, State state, double prob) throws Exception{
305            if(Double.isNaN(dist.getWeight(state)))
306                dist.setWeight(state,0);
307            Iterator iter = ((FiniteAlphabet)dist.getAlphabet()).iterator();
308            while(iter.hasNext()){
309                Symbol to = (Symbol) iter.next();
310                double currentP = dist.getWeight(to);
311                dist.setWeight(to,currentP*(1-prob));
312            }
313            dist.setWeight(state, dist.getWeight(state)+prob);
314            validateDistributionSum(dist);
315        }
316
317        private void checkTransitionSum() throws Exception{
318            for (int i=0; i<=hmm.columns();i++){
319                validateDistributionSum(hmm.getWeights(hmm.getMatch(i)));
320                if(i>0 && i<hmm.columns()){
321                    validateDistributionSum(hmm.getWeights(hmm.getInsert(i)));
322                    validateDistributionSum(hmm.getWeights(hmm.getDelete(i)));
323                }
324            }
325        }
326
327        private void validateSum(double sum) throws Exception{
328            if(Math.abs(sum-1.0)>sumCheckThreshold)
329                throw new Exception("Distribution does not sum to 1.  Sums to "+sum);
330        }
331
332        /** Modifies HMM search for partial hits, by dividing probability by 2 at
333         *  each point and adding transition to end state
334         */
335        void addProfileHMMTransitions() throws Exception{
336            for(int i=1; i<=hmm.columns(); i++){
337                if(i>1){
338                    hmm.createTransition(hmm.magicalState(), hmm.getMatch(i));
339                }
340                if(i<hmm.columns()){
341                    hmm.createTransition(hmm.getMatch(i), hmm.magicalState());
342                }
343            }
344        }
345
346
347        /** Modifies HMM search for partial hits, by dividing probability by 2 at
348         *  each point and adding transition to end state
349         */
350        void modifyProbabilities() throws Exception{
351            for(int i=1; i<=hmm.columns(); i++){
352                if(i>1){
353                    addProbability(hmm.getWeights(hmm.magicalState()), hmm.getMatch(i), 0.5);
354                }
355                if(i<hmm.columns()){
356                    addProbability(hmm.getWeights(hmm.getMatch(i)), hmm.magicalState(), 0.5);
357                }
358            }
359        }
360            
361
362        void setBeginEnd() throws Exception{
363            Distribution dist = hmm.getWeights(hmm.magicalState());
364            for(int i=1; i<=hmm.columns(); i++){
365                EmissionState match = hmm.getMatch(i);
366                Distribution match_dist = hmm.getWeights(match);
367                dist.setWeight(match, convertToProb(transitions[i][7]));
368                match_dist.setWeight(hmm.magicalState(),convertToProb(transitions[i][8]));
369            }
370        }
371        
372        void setFullProfileHMM(){
373            try{
374                Distribution dist = hmm_full.getWeights(hmm_full.magicalState());
375                dist.setWeight(hmm_full.nState(), 1.0);
376
377                dist = hmm_full.getWeights(hmm_full.nState());
378                dist.setWeight(hmm_full.hmm(), convertToProb(specialTransitions[0]));
379                dist.setWeight(hmm_full.nState(), convertToProb(specialTransitions[1]));
380
381                dist = hmm_full.getWeights(hmm_full.hmm());
382                dist.setWeight(hmm_full.cState(), convertToProb(specialTransitions[2]));
383                dist.setWeight(hmm_full.jState(), convertToProb(specialTransitions[3]));
384
385                dist = hmm_full.getWeights(hmm_full.cState());
386                dist.setWeight(hmm_full.magicalState(), convertToProb(specialTransitions[4]));
387                dist.setWeight(hmm_full.cState(), convertToProb(specialTransitions[5]));
388
389                dist = hmm_full.getWeights(hmm_full.jState());
390                dist.setWeight(hmm_full.hmm(), convertToProb(specialTransitions[6]));
391                dist.setWeight(hmm_full.jState(), convertToProb(specialTransitions[7]));
392            }
393            catch(Throwable t){
394                t.printStackTrace();
395            }
396        }
397
398
399        void setProfileHMM(){
400            try{
401            for(int i=0; i<=hmm.columns(); i++){
402                EmissionState match = hmm.getMatch(i);
403                Distribution dist = hmm.getWeights(match);
404                if(i<hmm.columns()){
405                    dist.setWeight(hmm.getMatch(i+1), 
406                                   convertToProb(transitions[i][0]));
407                    if(i>=1){
408                        dist.setWeight(hmm.getInsert(i), 
409                                       convertToProb(transitions[i][1]));
410                    }
411                    dist.setWeight(hmm.getDelete(i+1), 
412                                   convertToProb(transitions[i][2]));
413                }
414                else{
415                    dist.setWeight(hmm.magicalState(),1.0);
416                }
417            }
418            for(int i=1; i<hmm.columns(); i++){
419                EmissionState insert = hmm.getInsert(i);
420                Distribution dist = hmm.getWeights(insert);
421                dist.setWeight(hmm.getMatch(i+1), 
422                               convertToProb(transitions[i][3]));
423                dist.setWeight(insert, 
424                               convertToProb(transitions[i][4]));
425            }
426            for(int i=1; i<hmm.columns(); i++){
427                DotState delete = hmm.getDelete(i);
428                Distribution dist = hmm.getWeights(delete);
429                dist.setWeight(hmm.getMatch(i+1), 
430                               convertToProb(transitions[i][5]));
431                dist.setWeight(hmm.getDelete(i+1), 
432                               convertToProb(transitions[i][6]));
433            }
434            setBeginEnd();
435            checkTransitionSum();
436            // setting emission probabilities
437
438            Distribution insertEmission = hmm.getInsert(1).getDistribution();
439            Distribution nullModel = DistributionFactory.DEFAULT.createDistribution(alph);
440            for (int j=0; j<alphList.length; j++){
441                double prob;
442                double null_prob;
443                if(j<alphList.length-1){
444            null_prob = convertToProb(nullEmissions[j],0.05);   
445            prob = convertToProb(insertEmissions[j], null_prob);
446        
447                }
448                else{
449                    prob  = 0.0;
450                    null_prob = 0.0;
451                }
452                insertEmission.setWeight(alphList[j],prob);
453                nullModel.setWeight(alphList[j],null_prob);
454            }
455            insertEmission.setNullModel(nullModel);
456            validateDistributionSum(insertEmission);
457            //System.out.println("NULL MODEL!!!!!");
458            validateDistributionSum(nullModel);
459
460            for(int i=1; i<=hmm.columns(); i++){
461                Distribution matchEmission = hmm.getMatch(i).getDistribution();
462                if(i>1 && i<hmm.columns())
463                    hmm.getInsert(i).setDistribution(insertEmission);
464                for (int j=0; j<alphList.length; j++){
465                double prob;
466                if(j<alphList.length-1){
467                        double  null_prob = convertToProb(nullEmissions[j],0.05);
468                    prob = convertToProb(emissions[i-1][j],null_prob);
469                }
470                else prob  = 0.0;
471                    matchEmission.setWeight(alphList[j],prob);
472                }
473                validateDistributionSum(matchEmission);
474                matchEmission.setNullModel(nullModel);
475                //System.out.println("NULL MODEL!!!!!");
476                validateDistributionSum(matchEmission.getNullModel());
477            }
478            //modifyProbabilities();
479            }
480             catch(Throwable t){
481                t.printStackTrace();
482            }
483        }           
484
485        private double convertToProb(int score){
486            double result=0.0;
487            if(score!=Integer.MIN_VALUE){
488                result = 1*Math.pow(2.0,((double) score/1000));
489            }
490            return result;
491        }
492
493        private double convertToProb(int score, double nullprob){
494            double result=0.0;
495            if(score!=Integer.MIN_VALUE){
496                //double background = 0.05*Math.pow(2.0,((double) nullscore/1000));
497                //result = background;
498                result = nullprob*Math.pow(2.0,((double) score/1000));
499                //System.out.println(score+ " "+nullscore+" "+result);
500            }
501            return result;
502        }
503
504
505    }
506}
507
508
509
510        
511