001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.bio.program.sax; 022 023import java.io.BufferedReader; 024import java.io.IOException; 025import java.util.ArrayList; 026 027import org.xml.sax.Attributes; 028import org.xml.sax.InputSource; 029import org.xml.sax.SAXException; 030import org.xml.sax.helpers.AttributesImpl; 031 032/** 033 * A SAX2 parser for dealing with native PDB files. That is, 034 * this class allows native PDB format files to be processed 035 * as if they were in PdbXML format, but without an interconversion 036 * step. That is, events are generated that call methods 037 * on an XML document handler. 038 * <p> 039 * <b>Note this code is experimental, and may change without notice. 040 * 041 * </b> 042 * <p> 043 * 044 * Copyright © 2000 - 2002 Cambridge Antibody Technology. 045 * 046 * <p> 047 * Primary author -<ul> 048 * <li>Simon Brocklehurst (CAT) 049 * </ul> 050 * Other authors -<ul> 051 * <li>Neil Benn (CAT) 052 * <li>Derek Crockford (CAT) 053 * <li>Tim Dilks (CAT) 054 * <li>Colin Hardman (CAT) 055 * <li>Stuart Johnston (CAT) 056 *</ul> 057 * 058 * 059 * @author Cambridge Antibody Technology (CAT) 060 * @author Greg Cox 061 * @version 0.8 062 * 063 */ 064public class PdbSAXParser extends AbstractNativeAppSAXParser { 065 066 067 private ArrayList oRecordList = new ArrayList(); 068 private String oRecord; 069 private int iPos; 070 private int iModelStart; 071 private int iModelStop; 072 073 private AttributesImpl oAtts = new AttributesImpl(); 074 private QName oAttQName = new QName(this); 075 076 /** 077 * Sets namespace prefix to "biojava" 078 */ 079 public PdbSAXParser() { 080 this.setNamespacePrefix("biojava"); 081 } 082 083 public void parse (String poURI) throws IOException,SAXException { 084 this.parse(new InputSource(poURI)); 085 } 086 /** 087 * Describe 'parse' method here. 088 * 089 * @param poSource - 090 */ 091 public void parse(InputSource poSource ) 092 throws IOException,SAXException { 093 094 BufferedReader oContents; 095 String oLine = null; 096 097 //Use method form superclass 098 oContents = this.getContentStream(poSource); 099 100 101 try { 102 // loop over file 103 oLine = oContents.readLine(); 104 while (oLine != null) { 105 String oPadLine = this.padLine(oLine); 106 // put padded line into ArrayList 107 oRecordList.add(oPadLine); 108 //System.out.println(oLine); 109 oLine = oContents.readLine(); 110 } // end while 111 112 //----------------------- 113 114 //At this point, have the entire raw file in core memory. 115 //Now parse it and fire of relevant events 116 117 //First preprocess file 118 119 //Rule 120 //If there are no model records, then insert records 121 //for a single model. MODEL record before first ATOM, 122 //ENDMDL and before, CONECT, MASTER, END 123 124 boolean tIsModel = false; 125 126 for (int i = 0; i < oRecordList.size(); i++) { 127 oRecord = (String)oRecordList.get(i); 128 if (oRecord.startsWith("MODEL")) { 129 tIsModel = true; 130 break; 131 } 132 } 133 134 boolean tFoundFirstAtom = false; 135 if (!tIsModel) { 136 //System.out.println("No MODEL records"); 137 for (int i = 0; i < oRecordList.size(); i++) { 138 oRecord = (String)oRecordList.get(i); 139 140 if ( ((oRecord.startsWith("ATOM ")) || 141 (oRecord.startsWith("HETATM"))) && 142 (!tFoundFirstAtom)) { 143 tFoundFirstAtom = true; 144 145 //System.out.println("Found first atom>"+i+"<"); 146 147 oRecordList.add(i,"MODEL 1"); 148 break; 149 } 150 } 151 152 boolean tFoundLastAtom = false; 153 for (int i = oRecordList.size() - 1; i > 0; i--) { 154 oRecord = (String)oRecordList.get(i); 155 156 if ( ((oRecord.startsWith("ATOM ")) || 157 (oRecord.startsWith("HETATM")) || 158 (oRecord.startsWith("TER") )) && 159 (!tFoundLastAtom)) { 160 161 tFoundLastAtom = true; 162 163 //System.out.println("Found last atom>"+i+"<"); 164 165 oRecordList.add(i+1,"ENDMDL"); 166 break; 167 } 168 } 169 170 } //end if tIsModel == false 171 172 173 //End preprocess file 174 175 //At this point, the PDB records should be 176 //in a suitable state for parsing... 177 178 179 oAtts.clear(); 180 this.startElement(new QName(this, 181 this.prefix("MolecularStructureList")), 182 (Attributes)oAtts); 183 184 185 //Start at beginning of RecordList and progress 186 //through to end using global iPos variable 187 //to keep track of position 188 189 iPos = 0; 190 191 //keep track of start pos of model - 192 //need this for multiple passes through 193 //to get protein, dna, solvent etc. 194 195 iModelStart = 0; 196 iModelStop = 0; 197 String oModelId; 198 String oStructureId; 199 while (iPos < oRecordList.size()) { 200 //System.out.println("Line: "+iPos); 201 oRecord = (String)oRecordList.get(iPos); 202 203 if (oRecord.startsWith("HEADER")) { 204 oStructureId = oRecord.substring(62,66).trim(); 205 System.out.println(oStructureId); 206 207 oAtts.clear(); 208 oAttQName.setQName("id"); 209 oAtts.addAttribute(oAttQName.getURI(), 210 oAttQName.getLocalName(), 211 oAttQName.getQName(), 212 "CDATA",oStructureId); 213 214 //TODO EMPTY ELEMENT 215 this.startElement(new QName(this,this.prefix("PdbCode")), 216 (Attributes)oAtts); 217 this.endElement(new QName(this,this.prefix("PdbCode"))); 218 219 } 220 221 222 if (oRecord.startsWith("MODEL")) { 223 iModelStart = iPos; 224 oModelId = oRecord.substring(10,14).trim(); 225 226 oAtts.clear(); 227 oAttQName.setQName("modelId"); 228 oAtts.addAttribute(oAttQName.getURI(), 229 oAttQName.getLocalName(), 230 oAttQName.getQName(), 231 "CDATA",oModelId); 232 233 this.startElement(new QName(this,this.prefix("MolecularStructure")), 234 (Attributes)oAtts); 235 236 } 237 238 if (oRecord.startsWith("ENDMDL")) { 239 //keep position of the end of this model 240 iModelStop = iPos; 241 242 //at this point have start and end positions 243 //of current model 244 245 //do multiple passes for each type of molecule 246 247 //parse protein for this model... 248 249 oAtts.clear(); 250 this.startElement(new QName(this,this.prefix("Protein")), 251 (Attributes)oAtts); 252 253 254 oAtts.clear(); 255 this.startElement(new QName(this, 256 this.prefix("ProteinChainList")), 257 (Attributes)oAtts); 258 259 260 this.parseProtein(iModelStart,iModelStop); 261 //close final Atom Residue and ProteinChain 262 263 this.endElement(new QName(this,this.prefix("Atom"))); 264 265 this.endElement( 266 new QName(this,this.prefix("AminoAcidResidue"))); 267 this.endElement(new QName(this, 268 this.prefix("ProteinChain"))); 269 this.endElement(new QName(this, 270 this.prefix("ProteinChainList"))); 271 272 //todo parse solvent, dna etc. 273 274 //having parsed all content, end model 275 this.endElement(new QName(this,this.prefix("MolecularStructure"))); 276 277 } 278 iPos++; 279 } 280 this.endElement(new QName(this, 281 this.prefix("MolecularStructureList"))); 282 283 //System.out.println("Finished parsing"); 284 285 } catch (java.io.IOException x) { 286 System.out.println(x.getMessage()); 287 System.out.println("File read interupted"); 288 } // end try/catch 289 290 } 291 //================================================================== 292 //private methods 293 //================================================================== 294 295 /** 296 * Parse protein content of pdb output 297 * 298 * @param nil - 299 */ 300 private void parseProtein(int piStart, int piStop) 301 throws SAXException { 302 303 String oChainId; 304 305 String oAtomId; 306 String oAtomType; 307 308 String oResidueId; 309 String oResidueType; 310 311 String oX; 312 String oY; 313 String oZ; 314 String oOccupancy; 315 String oTemperatureFactor; 316 String oElement; 317 318 String oCurrentChainId; 319 String oCurrentResidueId; 320 321 322 boolean tFirstChain = true; 323 boolean tFirstResidue = true; 324 325 oCurrentChainId="XXX"; //set as an impossible initial value 326 oCurrentResidueId="A*ZZ**"; //set as an impossible initial value 327 328 for (int i = piStart; i < piStop; i++) { 329 oRecord = (String)oRecordList.get(i); 330 //System.out.println("parsing protein>" + oRecord); 331 332 if ( (oRecord.startsWith("ATOM ")) || 333 (oRecord.startsWith("HETATM")) ) { 334 //System.out.println(">"+oRecord.substring(17,20)+"<"); 335 336 oAtomId = oRecord.substring(6,11).trim(); 337 oAtomType = oRecord.substring(12,16).trim(); 338 339 oResidueType = oRecord.substring(17,20).trim(); 340 341 //go straight to next atom if this one not protein 342 if (!checkIfProtein(oResidueType)) continue; 343 344 //assign varables from ATOM record 345 oChainId = oRecord.substring(21,23).trim(); 346 oResidueId = oRecord.substring(23,27).trim(); 347 348 oX = oRecord.substring(30,38).trim(); 349 oY = oRecord.substring(38,46).trim(); 350 oZ = oRecord.substring(46,54).trim(); 351 352 oOccupancy = oRecord.substring(54,60).trim(); 353 oTemperatureFactor = oRecord.substring(60,66).trim(); 354 355 oElement = oRecord.substring(76,78).trim(); 356 357 //check new residue event 358 359 if (!oResidueId.equals(oCurrentResidueId)) { 360 if (!tFirstResidue) { 361 this.endElement(new QName(this, 362 this.prefix("AminoAcidResidue"))); 363 364 } 365 if (!oChainId.equals(oCurrentChainId)) { 366 if (!tFirstChain) { 367 368 this.endElement(new QName(this, 369 this.prefix("ProteinChain"))); 370 371 } 372 //check new chain event 373 oAtts.clear(); 374 oAttQName.setQName("chainId"); 375 oAtts.addAttribute(oAttQName.getURI(), 376 oAttQName.getLocalName(), 377 oAttQName.getQName(), 378 "CDATA",oChainId); 379 380 this.startElement(new QName(this, 381 this.prefix("ProteinChain")), 382 (Attributes)oAtts); 383 384 385 tFirstChain = false; //a bit ugly to set all the time. 386 oCurrentChainId = oChainId; 387 } 388 389 oAtts.clear(); 390 oAttQName.setQName("residueId"); 391 oAtts.addAttribute(oAttQName.getURI(), 392 oAttQName.getLocalName(), 393 oAttQName.getQName(), 394 "CDATA",oResidueId); 395 oAttQName.setQName("residueType"); 396 oAtts.addAttribute(oAttQName.getURI(), 397 oAttQName.getLocalName(), 398 oAttQName.getQName(), 399 "CDATA",oResidueType); 400 401 this.startElement( 402 new QName(this,this.prefix("AminoAcidResidue")), 403 (Attributes)oAtts); 404 405 tFirstResidue = false; //a bit ugly to set all the time. 406 oCurrentResidueId = oResidueId; 407 } 408 409 //finally fire new atom-related events 410 411 oAtts.clear(); 412 oAttQName.setQName("atomId"); 413 oAtts.addAttribute(oAttQName.getURI(), 414 oAttQName.getLocalName(), 415 oAttQName.getQName(), 416 "CDATA",oAtomId); 417 oAttQName.setQName("atomType"); 418 oAtts.addAttribute(oAttQName.getURI(), 419 oAttQName.getLocalName(), 420 oAttQName.getQName(), 421 "CDATA",oAtomType); 422 423 if ( ! oElement.equals("") ) { 424 oAttQName.setQName("element"); 425 oAtts.addAttribute(oAttQName.getURI(), 426 oAttQName.getLocalName(), 427 oAttQName.getQName(), 428 "CDATA", oElement); 429 } 430 431 oAttQName.setQName("occupancy"); 432 oAtts.addAttribute(oAttQName.getURI(), 433 oAttQName.getLocalName(), 434 oAttQName.getQName(), 435 "CDATA",oOccupancy); 436 437 oAttQName.setQName("temperatureFactor"); 438 oAtts.addAttribute(oAttQName.getURI(), 439 oAttQName.getLocalName(), 440 oAttQName.getQName(), 441 "CDATA", oTemperatureFactor); 442 443 444 this.startElement(new QName(this,this.prefix("Atom")), 445 (Attributes)oAtts); 446 447 448 oAtts.clear(); 449 oAttQName.setQName("x"); 450 oAtts.addAttribute(oAttQName.getURI(), 451 oAttQName.getLocalName(), 452 oAttQName.getQName(), 453 "CDATA",oX); 454 oAttQName.setQName("y"); 455 oAtts.addAttribute(oAttQName.getURI(), 456 oAttQName.getLocalName(), 457 oAttQName.getQName(), 458 "CDATA",oY); 459 460 oAttQName.setQName("z"); 461 oAtts.addAttribute(oAttQName.getURI(), 462 oAttQName.getLocalName(), 463 oAttQName.getQName(), 464 "CDATA",oZ); 465 466 467 this.startElement(new QName(this,this.prefix("Coordinates")), 468 (Attributes)oAtts); 469 470 471 this.endElement(new QName(this,this.prefix("Coordinates"))); 472 this.endElement(new QName(this,this.prefix("Atom"))); 473 474 } 475 } 476 477 } 478 479 480 /** 481 * Checks to see if a given residue type is part of a protein. 482 * NB at the moment, this doesn't work - just returns true. 483 * FIX THIS 484 * 485 * @param poResType Three-letter residue code 486 * @return boolean Returns true if a protein, false if not. 487 */ 488 private boolean checkIfProtein(String poResType) { 489 490 return true; 491 } 492 493 494 /** 495 * Takes a a line. If shorted that 80 characters 496 * returns a new version of the line, with spaces 497 * appended so that it is 80 characers. 498 * 499 * @param poLine a <code>String</code> value 500 * @return a <code>String</code> value 501 */ 502 private String padLine(String poLine) { 503 504 int iLength = poLine.length(); 505 506 int iDesiredLength = 80; 507 char cPadChar = ' '; 508 509 //do nothing if line length more than or equals to 80 510 511 if (iLength >= 80) { 512 return poLine; 513 } 514 515 //else pad with spaces 516 517 //System.out.println("Length: " + poLine.length()); 518 519 StringBuffer oBuff = new StringBuffer(poLine); 520 521 int iInsertLength = iDesiredLength - iLength; 522 523 char[] aoInsert = new char[iInsertLength]; 524 525 // System.out.println("Insert Length: " + iInsertLength); 526 527 for (int i = 0; i < iInsertLength; i++) { 528 aoInsert[i] = cPadChar; 529 } 530 531 oBuff.append(aoInsert); 532 533 return oBuff.substring(0); 534 } 535 536}