001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.program.sax;
022
023import java.io.BufferedReader;
024import java.io.IOException;
025import java.util.ArrayList;
026
027import org.xml.sax.Attributes;
028import org.xml.sax.InputSource;
029import org.xml.sax.SAXException;
030import org.xml.sax.helpers.AttributesImpl;
031
032/**
033 * A SAX2 parser for dealing with native PDB files.  That is,
034 * this class allows native PDB format files to be processed
035 * as if they were in PdbXML format, but without an interconversion
036 * step.  That is, events are generated that call methods
037 * on an XML document handler.
038 * <p>
039 * <b>Note this code is experimental, and may change without notice.
040 *
041 * </b>
042 * <p>
043 *
044 * Copyright &copy; 2000 - 2002 Cambridge Antibody Technology.
045 * 
046 * <p>
047 * Primary author -<ul>
048 * <li>Simon Brocklehurst (CAT)
049 * </ul>
050 * Other authors  -<ul>
051 * <li>Neil Benn          (CAT)
052 * <li>Derek Crockford    (CAT)
053 * <li>Tim Dilks          (CAT)
054 * <li>Colin Hardman      (CAT)
055 * <li>Stuart Johnston    (CAT)
056 *</ul>
057 *
058 *
059 * @author Cambridge Antibody Technology (CAT)
060 * @author Greg Cox
061 * @version 0.8
062 *
063 */
064public class PdbSAXParser extends AbstractNativeAppSAXParser {
065
066
067    private ArrayList        oRecordList = new ArrayList();
068    private String           oRecord;
069    private int              iPos;
070    private int              iModelStart;
071    private int              iModelStop;
072
073    private AttributesImpl          oAtts     = new AttributesImpl();
074    private QName                   oAttQName = new QName(this);
075
076    /**
077     * Sets namespace prefix to "biojava"
078     */
079    public PdbSAXParser() {
080        this.setNamespacePrefix("biojava");
081    }
082
083    public void parse (String poURI) throws IOException,SAXException {
084        this.parse(new InputSource(poURI));
085    }
086    /**
087     * Describe 'parse' method here.
088     *
089     * @param poSource   -
090     */
091    public void parse(InputSource poSource )
092        throws IOException,SAXException {
093
094        BufferedReader            oContents;
095        String                    oLine = null;
096
097        //Use method form superclass
098        oContents = this.getContentStream(poSource);
099
100
101        try {
102            // loop over file
103            oLine = oContents.readLine();
104            while (oLine != null) {
105                String oPadLine = this.padLine(oLine);
106                // put padded line into ArrayList
107                oRecordList.add(oPadLine);
108                //System.out.println(oLine);
109                oLine = oContents.readLine();
110            } // end while
111
112            //-----------------------
113
114            //At this point, have the entire raw file in core memory.
115            //Now parse it and fire of relevant events
116
117            //First preprocess file
118
119            //Rule
120            //If there are no model records, then insert records
121            //for a single model.  MODEL record before first ATOM,
122            //ENDMDL and before, CONECT, MASTER, END
123
124            boolean tIsModel = false;
125
126            for (int i = 0; i < oRecordList.size(); i++) {
127                oRecord = (String)oRecordList.get(i);
128                if (oRecord.startsWith("MODEL")) {
129                    tIsModel = true;
130                    break;
131                }
132            }
133
134            boolean tFoundFirstAtom = false;
135            if (!tIsModel) {
136                //System.out.println("No MODEL records");
137                for (int i = 0; i < oRecordList.size(); i++) {
138                    oRecord = (String)oRecordList.get(i);
139
140                    if ( ((oRecord.startsWith("ATOM  ")) ||
141                          (oRecord.startsWith("HETATM")))  &&
142                        (!tFoundFirstAtom))             {
143                        tFoundFirstAtom = true;
144
145                        //System.out.println("Found first atom>"+i+"<");
146
147                        oRecordList.add(i,"MODEL        1");
148                        break;
149                    }
150                }
151
152                boolean tFoundLastAtom = false;
153                for (int i = oRecordList.size() - 1; i > 0; i--) {
154                    oRecord = (String)oRecordList.get(i);
155
156                    if ( ((oRecord.startsWith("ATOM  ")) ||
157                          (oRecord.startsWith("HETATM")) ||
158                          (oRecord.startsWith("TER")  )) &&
159                         (!tFoundLastAtom))                 {
160
161                        tFoundLastAtom = true;
162
163                        //System.out.println("Found last atom>"+i+"<");
164
165                        oRecordList.add(i+1,"ENDMDL");
166                        break;
167                    }
168                }
169
170            } //end if tIsModel == false
171
172
173            //End preprocess file
174
175            //At this point, the PDB records should be
176            //in a suitable state for parsing...
177
178
179            oAtts.clear();
180            this.startElement(new QName(this,
181                                        this.prefix("MolecularStructureList")),
182                              (Attributes)oAtts);
183
184
185            //Start at beginning of RecordList and progress
186            //through to end using global iPos variable
187            //to keep track of position
188
189            iPos = 0;
190
191            //keep track of start pos of model -
192            //need this for multiple passes through
193            //to get protein, dna, solvent etc.
194
195            iModelStart = 0;
196            iModelStop = 0;
197            String oModelId;
198            String oStructureId;
199            while (iPos < oRecordList.size()) {
200                //System.out.println("Line: "+iPos);
201                oRecord = (String)oRecordList.get(iPos);
202
203                if (oRecord.startsWith("HEADER")) {
204                    oStructureId = oRecord.substring(62,66).trim();
205                    System.out.println(oStructureId);
206
207                    oAtts.clear();
208                    oAttQName.setQName("id");
209                    oAtts.addAttribute(oAttQName.getURI(),
210                                       oAttQName.getLocalName(),
211                                       oAttQName.getQName(),
212                                       "CDATA",oStructureId);
213
214                    //TODO EMPTY ELEMENT
215                    this.startElement(new QName(this,this.prefix("PdbCode")),
216                                      (Attributes)oAtts);
217                    this.endElement(new QName(this,this.prefix("PdbCode")));
218
219                }
220
221
222                if (oRecord.startsWith("MODEL")) {
223                    iModelStart = iPos;
224                    oModelId = oRecord.substring(10,14).trim();
225
226                    oAtts.clear();
227                    oAttQName.setQName("modelId");
228                    oAtts.addAttribute(oAttQName.getURI(),
229                                       oAttQName.getLocalName(),
230                                       oAttQName.getQName(),
231                                       "CDATA",oModelId);
232
233                    this.startElement(new QName(this,this.prefix("MolecularStructure")),
234                                      (Attributes)oAtts);
235
236                }
237
238                if (oRecord.startsWith("ENDMDL")) {
239                    //keep position of the end of this model
240                    iModelStop = iPos;
241
242                    //at this point have start and end positions
243                    //of current model
244
245                    //do multiple passes for each type of molecule
246
247                    //parse protein for this model...
248
249                    oAtts.clear();
250                    this.startElement(new QName(this,this.prefix("Protein")),
251                                      (Attributes)oAtts);
252
253
254                    oAtts.clear();
255                    this.startElement(new QName(this,
256                                                this.prefix("ProteinChainList")),
257                                      (Attributes)oAtts);
258
259
260                    this.parseProtein(iModelStart,iModelStop);
261                    //close final Atom Residue and ProteinChain
262
263                    this.endElement(new QName(this,this.prefix("Atom")));
264
265                    this.endElement(
266                                    new QName(this,this.prefix("AminoAcidResidue")));
267                    this.endElement(new QName(this,
268                                              this.prefix("ProteinChain")));
269                    this.endElement(new QName(this,
270                                              this.prefix("ProteinChainList")));
271
272                    //todo parse solvent, dna etc.
273
274                    //having parsed all content, end model
275                    this.endElement(new QName(this,this.prefix("MolecularStructure")));
276
277                }
278                iPos++;
279            }
280            this.endElement(new QName(this,
281                                      this.prefix("MolecularStructureList")));
282
283            //System.out.println("Finished parsing");
284
285        } catch (java.io.IOException x) {
286            System.out.println(x.getMessage());
287            System.out.println("File read interupted");
288        } // end try/catch
289
290    }
291    //==================================================================
292    //private methods
293    //==================================================================
294
295    /**
296     * Parse protein content of pdb output
297     *
298     * @param nil        -
299     */
300    private void parseProtein(int piStart, int piStop)
301        throws SAXException {
302
303        String oChainId;
304
305        String oAtomId;
306        String oAtomType;
307
308        String  oResidueId;
309        String  oResidueType;
310
311        String oX;
312        String oY;
313        String oZ;
314        String oOccupancy;
315        String oTemperatureFactor;
316        String oElement;
317
318        String oCurrentChainId;
319        String oCurrentResidueId;
320
321
322        boolean tFirstChain = true;
323        boolean tFirstResidue = true;
324
325        oCurrentChainId="XXX";    //set as an impossible initial value
326        oCurrentResidueId="A*ZZ**"; //set as an impossible initial value
327
328        for (int i = piStart; i < piStop; i++) {
329            oRecord = (String)oRecordList.get(i);
330            //System.out.println("parsing protein>" + oRecord);
331
332            if ( (oRecord.startsWith("ATOM  ")) ||
333                 (oRecord.startsWith("HETATM"))  ) {
334                //System.out.println(">"+oRecord.substring(17,20)+"<");
335
336                oAtomId = oRecord.substring(6,11).trim();
337                oAtomType = oRecord.substring(12,16).trim();
338
339                oResidueType = oRecord.substring(17,20).trim();
340
341                //go straight to next atom if this one not protein
342                if (!checkIfProtein(oResidueType)) continue;
343
344                //assign varables from ATOM record
345                oChainId = oRecord.substring(21,23).trim();
346                oResidueId = oRecord.substring(23,27).trim();
347
348                oX = oRecord.substring(30,38).trim();
349                oY = oRecord.substring(38,46).trim();
350                oZ = oRecord.substring(46,54).trim();
351
352                oOccupancy = oRecord.substring(54,60).trim();
353                oTemperatureFactor = oRecord.substring(60,66).trim();
354
355                oElement = oRecord.substring(76,78).trim();
356
357                //check new residue event
358
359                if (!oResidueId.equals(oCurrentResidueId)) {
360                    if (!tFirstResidue) {
361                        this.endElement(new QName(this,
362                                                  this.prefix("AminoAcidResidue")));
363
364                    }
365                    if (!oChainId.equals(oCurrentChainId)) {
366                        if (!tFirstChain) {
367
368                            this.endElement(new QName(this,
369                                                      this.prefix("ProteinChain")));
370
371                        }
372                        //check new chain event
373                        oAtts.clear();
374                        oAttQName.setQName("chainId");
375                        oAtts.addAttribute(oAttQName.getURI(),
376                                           oAttQName.getLocalName(),
377                                           oAttQName.getQName(),
378                                           "CDATA",oChainId);
379
380                        this.startElement(new QName(this,
381                                                    this.prefix("ProteinChain")),
382                                          (Attributes)oAtts);
383
384
385                        tFirstChain = false; //a bit ugly to set all the time.
386                        oCurrentChainId = oChainId;
387                    }
388
389                    oAtts.clear();
390                    oAttQName.setQName("residueId");
391                    oAtts.addAttribute(oAttQName.getURI(),
392                                       oAttQName.getLocalName(),
393                                       oAttQName.getQName(),
394                                       "CDATA",oResidueId);
395                    oAttQName.setQName("residueType");
396                    oAtts.addAttribute(oAttQName.getURI(),
397                                       oAttQName.getLocalName(),
398                                       oAttQName.getQName(),
399                                       "CDATA",oResidueType);
400
401                    this.startElement(
402                                      new QName(this,this.prefix("AminoAcidResidue")),
403                                      (Attributes)oAtts);
404
405                    tFirstResidue = false; //a bit ugly to set all the time.
406                    oCurrentResidueId = oResidueId;
407                }
408
409                //finally fire new atom-related events
410
411                oAtts.clear();
412                oAttQName.setQName("atomId");
413                oAtts.addAttribute(oAttQName.getURI(),
414                                   oAttQName.getLocalName(),
415                                   oAttQName.getQName(),
416                                   "CDATA",oAtomId);
417                oAttQName.setQName("atomType");
418                oAtts.addAttribute(oAttQName.getURI(),
419                                   oAttQName.getLocalName(),
420                                   oAttQName.getQName(),
421                                   "CDATA",oAtomType);
422
423                if ( ! oElement.equals("") ) {
424                    oAttQName.setQName("element");
425                    oAtts.addAttribute(oAttQName.getURI(),
426                                       oAttQName.getLocalName(),
427                                       oAttQName.getQName(),
428                                       "CDATA", oElement);
429                }
430
431                oAttQName.setQName("occupancy");
432                oAtts.addAttribute(oAttQName.getURI(),
433                                   oAttQName.getLocalName(),
434                                   oAttQName.getQName(),
435                                   "CDATA",oOccupancy);
436
437                oAttQName.setQName("temperatureFactor");
438                oAtts.addAttribute(oAttQName.getURI(),
439                                   oAttQName.getLocalName(),
440                                   oAttQName.getQName(),
441                                   "CDATA", oTemperatureFactor);
442
443
444                this.startElement(new QName(this,this.prefix("Atom")),
445                                  (Attributes)oAtts);
446
447
448                oAtts.clear();
449                oAttQName.setQName("x");
450                oAtts.addAttribute(oAttQName.getURI(),
451                                   oAttQName.getLocalName(),
452                                   oAttQName.getQName(),
453                                   "CDATA",oX);
454                oAttQName.setQName("y");
455                oAtts.addAttribute(oAttQName.getURI(),
456                                   oAttQName.getLocalName(),
457                                   oAttQName.getQName(),
458                                   "CDATA",oY);
459
460                oAttQName.setQName("z");
461                oAtts.addAttribute(oAttQName.getURI(),
462                                   oAttQName.getLocalName(),
463                                   oAttQName.getQName(),
464                                   "CDATA",oZ);
465
466
467                this.startElement(new QName(this,this.prefix("Coordinates")),
468                                  (Attributes)oAtts);
469
470
471                this.endElement(new QName(this,this.prefix("Coordinates")));
472                this.endElement(new QName(this,this.prefix("Atom")));
473
474            }
475        }
476
477    }
478
479
480    /**
481     * Checks to see if a given residue type is part of a protein.
482     * NB at the moment, this doesn't work - just returns true.
483     * FIX THIS
484     *
485     * @param poResType  Three-letter residue code
486     * @return boolean   Returns true if a protein, false if not.
487     */
488    private boolean checkIfProtein(String poResType) {
489
490        return true;
491    }
492
493
494    /**
495     * Takes a a line. If shorted that 80 characters
496     * returns a new version of the line, with spaces
497     * appended so that it is 80 characers.
498     *
499     * @param poLine a <code>String</code> value
500     * @return a <code>String</code> value
501     */
502    private String padLine(String poLine) {
503
504        int iLength = poLine.length();
505
506        int iDesiredLength = 80;
507        char cPadChar = ' ';
508
509        //do nothing if line length more than or equals to 80
510
511        if (iLength >= 80) {
512            return poLine;
513        }
514
515        //else pad with spaces
516
517        //System.out.println("Length: " + poLine.length());
518
519        StringBuffer oBuff = new StringBuffer(poLine);
520
521        int iInsertLength = iDesiredLength - iLength;
522
523        char[] aoInsert = new char[iInsertLength];
524
525        //      System.out.println("Insert Length: " + iInsertLength);
526
527        for (int i = 0; i < iInsertLength; i++) {
528            aoInsert[i] = cPadChar;
529        }
530
531        oBuff.append(aoInsert);
532
533        return oBuff.substring(0);
534    }
535
536}