001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.PrintStream; 031import java.util.ArrayList; 032import java.util.Iterator; 033import java.util.List; 034import java.util.Map; 035import java.util.Set; 036import java.util.TreeMap; 037import java.util.TreeSet; 038import java.util.regex.Matcher; 039import java.util.regex.Pattern; 040 041import org.biojava.bio.proteomics.MassCalc; 042import org.biojava.bio.seq.Sequence; 043import org.biojava.bio.seq.io.ParseException; 044import org.biojava.bio.seq.io.SeqIOListener; 045import org.biojava.bio.seq.io.SymbolTokenization; 046import org.biojava.bio.symbol.IllegalAlphabetException; 047import org.biojava.bio.symbol.IllegalSymbolException; 048import org.biojava.bio.symbol.SimpleSymbolList; 049import org.biojava.bio.symbol.Symbol; 050import org.biojava.bio.symbol.SymbolList; 051import org.biojava.ontology.Term; 052import org.biojava.utils.ChangeVetoException; 053import org.biojavax.Comment; 054import org.biojavax.CrossRef; 055import org.biojavax.DocRef; 056import org.biojavax.DocRefAuthor; 057import org.biojavax.Namespace; 058import org.biojavax.Note; 059import org.biojavax.RankedCrossRef; 060import org.biojavax.RankedDocRef; 061import org.biojavax.RichObjectFactory; 062import org.biojavax.SimpleComment; 063import org.biojavax.SimpleCrossRef; 064import org.biojavax.SimpleDocRef; 065import org.biojavax.SimpleDocRefAuthor; 066import org.biojavax.SimpleNote; 067import org.biojavax.SimpleRankedCrossRef; 068import org.biojavax.SimpleRankedDocRef; 069import org.biojavax.SimpleRichAnnotation; 070import org.biojavax.bio.seq.RichFeature; 071import org.biojavax.bio.seq.RichLocation; 072import org.biojavax.bio.seq.RichSequence; 073import org.biojavax.bio.taxa.NCBITaxon; 074import org.biojavax.bio.taxa.SimpleNCBITaxon; 075import org.biojavax.ontology.ComparableTerm; 076import org.biojavax.utils.CRC64Checksum; 077import org.biojavax.utils.StringTools; 078 079/** 080 * Format reader for UniProt files. This version of UniProt format will generate 081 * and write RichSequence objects. Loosely Based on code from the old, deprecated, 082 * org.biojava.bio.seq.io.EMBLLikeFormat object. Since 1.7, the parser reads the 083 * International Protein Index (IPI) pseudo-Uniprot format. 084 * 085 * @author Richard Holland 086 * @author Mark Schreiber 087 * @author George Waldon 088 * @since 1.5 089 */ 090public class UniProtFormat extends RichSequenceFormat.HeaderlessFormat { 091 092 // Register this format with the format auto-guesser. 093 static { 094 RichSequence.IOTools.registerFormat(UniProtFormat.class); 095 } 096 097 /** 098 * The name of this format 099 */ 100 public static final String UNIPROT_FORMAT = "UniProt"; 101 102 private static final String SUBFORMAT_UNIPROT = "UniProt"; 103 private static final String SUBFORMAT_IPI = "IPI"; 104 105 protected static final String LOCUS_TAG = "ID"; 106 protected static final String ACCESSION_TAG = "AC"; 107 protected static final String DEFINITION_TAG = "DE"; 108 protected static final String DATE_TAG = "DT"; 109 protected static final String SOURCE_TAG = "OS"; 110 protected static final String ORGANELLE_TAG = "OG"; 111 protected static final String ORGANISM_TAG = "OC"; 112 protected static final String TAXON_TAG = "OX"; 113 protected static final String GENE_TAG = "GN"; 114 protected static final String DATABASE_XREF_TAG = "DR"; 115 protected static final String PROTEIN_EXIST_TAG = "PE"; 116 protected static final String REFERENCE_TAG = "RN"; 117 protected static final String RP_LINE_TAG = "RP"; 118 protected static final String REFERENCE_XREF_TAG = "RX"; 119 protected static final String AUTHORS_TAG = "RA"; 120 protected static final String CONSORTIUM_TAG = "RG"; 121 protected static final String TITLE_TAG = "RT"; 122 protected static final String LOCATION_TAG = "RL"; 123 protected static final String RC_LINE_TAG = "RC"; 124 protected static final String KEYWORDS_TAG = "KW"; 125 protected static final String COMMENT_TAG = "CC"; 126 protected static final String FEATURE_TAG = "FT"; 127 protected static final String START_SEQUENCE_TAG = "SQ"; 128 protected static final String END_SEQUENCE_TAG = "//"; 129 130 // locus line for uniprot format 131 protected static final Pattern lp_uniprot = Pattern.compile("^((\\S+)_(\\S+))\\s+(\\S+);\\s+(PRT)?;?\\s*\\d+\\s+AA\\.$"); 132 // locus line for IPI format 133 protected static final Pattern lp_ipi = Pattern.compile("^((\\S+)\\.(\\d+))\\s+(IPI);\\s+(PRT)?;?\\s*\\d+\\s+AA\\.$"); 134 // RP line parser 135 protected static final Pattern rppat = Pattern.compile("SEQUENCE OF (\\d+)-(\\d+)"); 136 // date lineDT for uniprot 137 // date, integrated into UniProtKB/database_name. 138 // date, sequence version x. 139 // date, entry version x. 140 protected static final Pattern dp_uniprot = Pattern.compile("([^,]+),([^\\d\\.]+)(\\d+)?\\.$"); 141 // date lineDT for IPI 142 // date (xxx, Created) 143 // date (xxx, Last sequence update) 144 protected static final Pattern dp_ipi = Pattern.compile("([^\\(]+)\\(([^,]+),([^\\)]+)\\)$"); 145 // feature line 146 protected static final Pattern fp = Pattern.compile("^\\s*([\\d?<]+\\s+[\\d?>]+)(\\s+(.*))?$"); 147 148 protected static final Pattern headerLine = Pattern.compile("^ID.*"); 149 150 /** 151 * Implements some UniProt-specific terms. 152 */ 153 public static class Terms extends RichSequence.Terms { 154 private static String GENENAME_KEY = "Name"; 155 private static String GENESYNONYM_KEY = "Synonyms"; 156 private static String ORDLOCNAME_KEY = "OrderedLocusNames"; 157 private static String ORFNAME_KEY = "ORFNames"; 158 159 /** 160 * Getter for the UniProt term 161 * @return The UniProt Term 162 */ 163 public static ComparableTerm getUniProtTerm() { 164 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt"); 165 } 166 167 /** 168 * Getter for the UniProt combined database term 169 * @return The combined database for UniProt Term 170 */ 171 public static ComparableTerm getUniProtDBNameTerm() { 172 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt database name"); 173 } 174 175 /** 176 * Getter for the protein exists term 177 * @return The protein exists Term 178 */ 179 public static ComparableTerm getProteinExistsTerm() { 180 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt protein exists"); 181 } 182 } 183 184 /** 185 * {@inheritDoc} 186 * A file is in UniProt format if the first line matches the UniProt format for the ID line. 187 */ 188 public boolean canRead(File file) throws IOException { 189 BufferedReader br = new BufferedReader(new FileReader(file)); 190 String firstLine = br.readLine(); 191 boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && 192 (lp_uniprot.matcher(firstLine.substring(3).trim()).matches() || 193 lp_ipi.matcher(firstLine.substring(3).trim()).matches()); 194 br.close(); 195 return readable; 196 } 197 198 /** 199 * {@inheritDoc} 200 * Always returns a protein tokenizer. 201 */ 202 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 203 return RichSequence.IOTools.getProteinParser(); 204 } 205 206 /** 207 * {@inheritDoc} 208 * A stream is in UniProt format if the first line matches the UniProt format for the ID line. 209 */ 210 public boolean canRead(BufferedInputStream stream) throws IOException { 211 stream.mark(2000); // some streams may not support this 212 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 213 String firstLine = br.readLine(); 214 boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && 215 (lp_uniprot.matcher(firstLine.substring(3).trim()).matches() 216 || lp_ipi.matcher(firstLine.substring(3).trim()).matches()); 217 // don't close the reader as it'll close the stream too. 218 // br.close(); 219 stream.reset(); 220 return readable; 221 } 222 223 /** 224 * {@inheritDoc} 225 * Always returns a protein tokenizer. 226 */ 227 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { 228 return RichSequence.IOTools.getProteinParser(); 229 } 230 231 /** 232 * {@inheritDoc} 233 */ 234 public boolean readSequence(BufferedReader reader, 235 SymbolTokenization symParser, 236 SeqIOListener listener) 237 throws IllegalSymbolException, IOException, ParseException { 238 if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); 239 return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); 240 } 241 242 private String accession = null; 243 244 /** 245 * {@inheritDoc} 246 */ 247 public boolean readRichSequence(BufferedReader reader, 248 SymbolTokenization symParser, 249 RichSeqIOListener rlistener, 250 Namespace ns) 251 throws IllegalSymbolException, IOException, ParseException { 252 253 boolean hasAnotherSequence = true; 254 //boolean hasInternalWhitespace = false; 255 256 String subformat = SUBFORMAT_UNIPROT; 257 258 rlistener.startSequence(); 259 260 if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); 261 rlistener.setNamespace(ns); 262 263 // Get an ordered list of key->value pairs in array-tuples 264 String sectionKey = null; 265 NCBITaxon tax = null; 266 accession = null; 267 List section = null; 268 try{ 269 do { 270 271 section = this.readSection(reader); 272 sectionKey = ((String[])section.get(0))[0]; 273 if(sectionKey == null){ 274 String message = ParseException.newMessage(this.getClass(),accession, "", "Section key was null", sectionToString(section)); 275 throw new ParseException(message); 276 } 277 // process section-by-section 278 if (sectionKey.equals(LOCUS_TAG)) { 279 // entryname dataclass; moltype; sequencelength AA. 280 String loc = ((String[])section.get(0))[1]; 281 Matcher m = lp_uniprot.matcher(loc); 282 if (m.matches()) { 283 rlistener.setName(m.group(2)); 284 rlistener.setDivision(m.group(3)); 285 if (m.groupCount() > 4){ 286 rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(4)); 287 rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(5)); 288 }else{ 289 rlistener.addSequenceProperty(Terms.getDataClassTerm(), m.group(4)); 290 rlistener.addSequenceProperty(Terms.getMolTypeTerm(), ""); 291 } 292 } else { 293 m = lp_ipi.matcher(loc); 294 if (m.matches()) { 295 subformat = SUBFORMAT_IPI; 296 rlistener.setName(m.group(2)); 297 rlistener.setVersion(Integer.parseInt(m.group(3))); 298 rlistener.addSequenceProperty(Terms.getDataClassTerm(), m.group(4)); 299 rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(5)); 300 } else { 301 String message = ParseException.newMessage(this.getClass(),accession, "", "Bad ID line", sectionToString(section)); 302 throw new ParseException(message); 303 } 304 } 305 } else if (sectionKey.equals(DEFINITION_TAG)) { 306 String val = ((String[])section.get(0))[1]; 307 if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot 308 rlistener.setDescription(val); 309 } else if (sectionKey.equals(SOURCE_TAG)) { 310 // use SOURCE_TAG and TAXON_TAG values 311 String sciname = null; 312 String comname = null; 313 List synonym = new ArrayList(); 314 int taxid = 0; 315 for (int i = 0; i < section.size(); i++) { 316 String tag = ((String[])section.get(i))[0]; 317 String value = ((String[])section.get(i))[1].trim(); 318 value = value.replace("\n", " "); 319 value = value.replace("\r\n", " "); 320 321 if (tag.equals(SOURCE_TAG)) { 322 if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot 323 String[] parts = value.split("\\("); 324 sciname = parts[0].trim(); 325 if (parts.length>1) { 326 comname = parts[1].trim(); 327 if (comname.endsWith(")")) comname = comname.substring(0,comname.length()-1); // chomp trailing bracket 328 if (parts.length>2) { 329 // synonyms 330 for (int j = 2 ; j < parts.length; j++) { 331 String syn = parts[j].trim(); 332 if (syn.endsWith(")")) syn = syn.substring(0,syn.length()-1); // chomp trailing bracket 333 synonym.add(syn); 334 } 335 } 336 } 337 } else if (tag.equals(TAXON_TAG)) { 338 String[] parts = value.split(";"); 339 for (int j = 0; j < parts.length; j++) { 340 String[] bits = parts[j].split("="); 341 if (bits[0].equals("NCBI_TaxID")) { 342 String[] morebits = bits[1].split(","); 343 taxid = Integer.parseInt(morebits[0].trim()); 344 } 345 } 346 } else if (tag.equals(ORGANELLE_TAG)) { 347 if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot 348 String[] parts = value.split(";"); 349 for (int j = 0; j < parts.length; j++) { 350 parts[j]=parts[j].trim(); 351 rlistener.addSequenceProperty(Terms.getOrganelleTerm(),parts[j]); 352 } 353 } 354 } 355 // Set the Taxon 356 tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{new Integer(taxid)}); 357 rlistener.setTaxon(tax); 358 try { 359 if (sciname!=null) tax.addName(NCBITaxon.SCIENTIFIC,sciname); 360 if (comname!=null) tax.addName(NCBITaxon.COMMON,comname); 361 for (Iterator j = synonym.iterator(); j.hasNext(); ) tax.addName(NCBITaxon.SYNONYM, (String)j.next()); 362 } catch (ChangeVetoException e) { 363 throw new ParseException(e); 364 } 365 } else if (sectionKey.equals(DATE_TAG)) { 366 String chunk = ((String[])section.get(0))[1]; 367 if(subformat.equals(SUBFORMAT_UNIPROT)) { 368 Matcher dm = dp_uniprot.matcher(chunk); 369 if (dm.matches()) { 370 String date = dm.group(1).trim(); 371 String type = dm.group(2).trim(); 372 String rel = dm.group(3); 373 if (rel!=null) rel = rel.trim(); 374 if (type.startsWith("integrated into UniProtKB")) { 375 String dbname = type.split("/")[1]; 376 rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date); 377 rlistener.addSequenceProperty(Terms.getUniProtDBNameTerm(), dbname); 378 } else if (type.equalsIgnoreCase("sequence version")) { 379 if (rel==null){ 380 String message = ParseException.newMessage(this.getClass(),accession, "", "Version missing for "+type, sectionToString(section)); 381 throw new ParseException(message); 382 } 383 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date); 384 rlistener.setVersion(Integer.parseInt(rel)); 385 } else if (type.equalsIgnoreCase("entry version")) { 386 if (rel==null) { 387 String message = ParseException.newMessage(this.getClass(),accession, "", "Version missing for "+type, sectionToString(section)); 388 throw new ParseException(message); 389 } 390 rlistener.addSequenceProperty(Terms.getDateAnnotatedTerm(), date); 391 rlistener.addSequenceProperty(Terms.getRelAnnotatedTerm(), rel); 392 } else { 393 String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date type "+type, sectionToString(section)); 394 throw new ParseException(message); 395 } 396 } else { 397 String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date line", sectionToString(section)); 398 throw new ParseException(message); 399 } 400 } else if(subformat.equals(SUBFORMAT_IPI)) { 401 Matcher dm = dp_ipi.matcher(chunk); 402 if (dm.matches()) { 403 String date = dm.group(1).trim(); 404 String type = dm.group(3).trim(); 405 if(type.equals("Created")) { 406 rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date); 407 } else if(type.equals("Last sequence update")) { 408 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date); 409 } else { 410 String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date type "+type, sectionToString(section)); 411 throw new ParseException(message); 412 } 413 } else { 414 String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date line", sectionToString(section)); 415 throw new ParseException(message); 416 } 417 } else { 418 String message = ParseException.newMessage(this.getClass(),accession, "", "Unknown date line format", sectionToString(section)); 419 throw new ParseException(message); 420 } 421 } else if (sectionKey.equals(ACCESSION_TAG)) { 422 // if multiple accessions, store only first as accession, 423 // and store rest in annotation 424 String[] accs = ((String[])section.get(0))[1].split(";"); 425 if(accs.length>0) accession = accs[0].trim(); else accession = ""; 426 rlistener.setAccession(accession); 427 for (int i = 1; i < accs.length; i++) { 428 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim()); 429 } 430 } else if (sectionKey.equals(PROTEIN_EXIST_TAG)) { 431 String val = ((String[])section.get(0))[1]; 432 if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon 433 rlistener.addSequenceProperty(Terms.getProteinExistsTerm(),val.trim()); 434 } else if (sectionKey.equals(KEYWORDS_TAG)) { 435 String val = ((String[])section.get(0))[1]; 436 if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot 437 val = val.replace('\n',' '); //remove newline 438 String[] kws = val.split(";"); 439 for (int i = 0; i < kws.length; i++) { 440 String kw = kws[i].trim(); 441 if (kw.length()==0) continue; 442 rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw); 443 } 444 } else if (sectionKey.equals(GENE_TAG)) { 445 String[] genes = ((String[])section.get(0))[1].split("\\s+(or|and)\\s+"); 446 for (int geneID = 0; geneID < genes.length; geneID++) { 447 String[] parts = genes[geneID].split(";"); 448 for (int j = 0; j < parts.length; j++) { 449 String[] moreparts = parts[j].split("="); 450 String[] values = moreparts[1].split(","); 451 // nasty hack - we really should have notes on the gene object itself... if such a thing existed... 452 if (moreparts[0].trim().equals(Terms.GENENAME_KEY)) rlistener.addSequenceProperty(Terms.getGeneNameTerm(),geneID+":"+values[0].trim()); 453 else if (moreparts[0].trim().equals(Terms.GENESYNONYM_KEY)) { 454 for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getGeneSynonymTerm(),geneID+":"+values[k].trim()); 455 } else if (moreparts[0].trim().equals(Terms.ORDLOCNAME_KEY)) { 456 for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getOrderedLocusNameTerm(),geneID+":"+values[k].trim()); 457 } else if (moreparts[0].trim().equals(Terms.ORFNAME_KEY)) { 458 for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getORFNameTerm(),geneID+":"+values[k].trim()); 459 } 460 } 461 } 462 } else if (sectionKey.equals(DATABASE_XREF_TAG)) { 463 // database_identifier; primary_identifier; secondary_identifier.... 464 String val = ((String[])section.get(0))[1]; 465 if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot 466 String[] parts = val.split(";"); 467 // construct a DBXREF out of the dbname part[0] and accession part[1] 468 String dbname = parts[0].trim(); 469 String acc = parts[1].trim(); 470 CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname,acc,new Integer(0)}); 471 // assign remaining bits of info as additional accession annotations 472 for (int j = 2; j < parts.length; j++) { 473 ComparableTerm t = (ComparableTerm)Terms.getAdditionalAccessionTerm(); 474 Note note = new SimpleNote(t,parts[j].trim(),j-1); 475 try { 476 crossRef.getRichAnnotation().addNote(note); 477 } catch (ChangeVetoException ce) { 478 ParseException pe = new ParseException("Could not annotate additional accession terms"); 479 pe.initCause(ce); 480 throw pe; 481 } 482 } 483 RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0); 484 rlistener.setRankedCrossRef(rcrossRef); 485 } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) { 486 // first line of section has rank and location 487 String refrank = ((String[])section.get(0))[1]; 488 int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1)); 489 // rest can be in any order 490 String authors = null; 491 String consortium = null; 492 String title = null; 493 String locator = null; 494 String pubmed = null; 495 String medline = null; 496 String doi = null; 497 String remark = null; 498 Integer rstart = null; 499 Integer rend = null; 500 for (int i = 1; i < section.size(); i++) { 501 String key = ((String[])section.get(i))[0]; 502 String val = ((String[])section.get(i))[1]; 503 //System.err.println(key+": "+val); 504 if (key.equals(AUTHORS_TAG)) { 505 if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon 506 authors = val.replace('\n',' '); //see #2276 507 } 508 if (key.equals(CONSORTIUM_TAG)) { 509 if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon 510 consortium = val.replace('\n',' '); //see #2276 511 } 512 if (key.equals(TITLE_TAG)) { 513 if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon 514 if (val.endsWith("\"")) val = val.substring(1, val.length()-1); // chomp quotes 515 title = val.replace('\n',' '); //see #2276 516 } 517 if (key.equals(LOCATION_TAG)) { 518 if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot 519 locator = val.replace('\n',' '); //see #2276 520 } 521 if (key.equals(REFERENCE_XREF_TAG)) { 522 // database_identifier=primary_identifier; 523 String[] refs = val.split(";"); 524 for (int j = 0 ; j < refs.length; j++) { 525 if (refs[j].trim().length()==0) continue; 526 String[] parts = refs[j].split("="); 527 if ( parts.length <2) { 528 // some DOI lines look like this and are causing problems: 529 //DOI=10.1002/(SICI)1097-0215(19990702)82:1<137::AID-IJC23>3.0.CO;2-F;ignoring 530 System.err.println("warning: problems while parsing: " + val); 531 continue; 532 } 533 String db = parts[0].trim(); 534 String ref = parts[1].trim(); 535 if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref; 536 else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref; 537 else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref; 538 } 539 } 540 if (key.equals(RP_LINE_TAG)) { 541 if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot 542 remark = val.replace('\n',' '); //see #2276 543 // Try to use it to find the location of the reference, if we have one. 544 Matcher m = rppat.matcher(val); 545 if (m.matches()) { 546 rstart = Integer.valueOf(m.group(1)); 547 rend = Integer.valueOf(m.group(2)); 548 } 549 } 550 if (key.equals(RC_LINE_TAG)) { 551 // Split into key=value pairs separated by semicolons and terminated with semicolon. 552 String[] parts = val.split(";"); 553 for (int j = 0; j < parts.length; j++) { 554 String[] subparts = parts[j].split("="); 555 // get term for first section 556 String termName = subparts[0].trim(); 557 Term t; 558 if (termName.equalsIgnoreCase(Terms.SPECIES_KEY)) t = Terms.getSpeciesTerm(); 559 else if (termName.equalsIgnoreCase(Terms.STRAIN_KEY)) t = Terms.getStrainTerm(); 560 else if (termName.equalsIgnoreCase(Terms.TISSUE_KEY)) t = Terms.getTissueTerm(); 561 else if (termName.equalsIgnoreCase(Terms.TRANSPOSON_KEY)) t = Terms.getTransposonTerm(); 562 else if (termName.equalsIgnoreCase(Terms.PLASMID_KEY)) t = Terms.getPlasmidTerm(); 563 else { 564 String message = ParseException.newMessage(this.getClass(),accession, "", "Invalid RC term found: "+termName, sectionToString(section)); 565 throw new ParseException(message); 566 } 567 // assign notes using term and rank:second section as value 568 // nasty hack - we really should have notes on the reference itself. 569 rlistener.addSequenceProperty(t, ref_rank+":"+subparts[1].trim()); 570 } 571 } 572 } 573 574 // create the docref object 575 try { 576 List auths = null; 577 if(authors != null) auths = DocRefAuthor.Tools.parseAuthorString(authors); 578 if (consortium!=null){ 579 if(auths == null) auths = new ArrayList(); 580 auths.add(new SimpleDocRefAuthor(consortium,true,false)); 581 } 582 DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{auths,locator,title}); 583 // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi 584 if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)})); 585 else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)})); 586 else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)})); 587 // assign the remarks 588 if (!this.getElideComments()) dr.setRemark(remark); 589 // assign the docref to the bioentry 590 RankedDocRef rdr = new SimpleRankedDocRef(dr,rstart,rend,ref_rank); 591 rlistener.setRankedDocRef(rdr); 592 } catch (ChangeVetoException e) { 593 throw new ParseException(e); 594 } 595 } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) { 596 // Set up some comments 597 String val = ((String[])section.get(0))[1]; 598 if (UniProtCommentParser.isParseable(val)) rlistener.setComment(val); 599 else { 600 // copyright message 601 rlistener.addSequenceProperty(Terms.getCopyrightTerm(), val); 602 } 603 } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) { 604 // starting from second line of input, start a new feature whenever we come across 605 // a key that does not start with / 606 boolean seenAFeature = false; 607 for (int i = 1 ; i < section.size(); i++) { 608 String key = ((String[])section.get(i))[0]; 609 String val = ((String[])section.get(i))[1]; 610 val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim(); 611 if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot 612 if (key.startsWith("/")) { 613 key = key.substring(1); // strip leading slash 614 if (key.equals("FTId")) rlistener.addFeatureProperty(Terms.getFTIdTerm(),val); 615 else { 616 // should never happen - but here just in case 617 rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val); 618 } 619 } else { 620 // new feature! 621 // end previous feature 622 if (seenAFeature) rlistener.endFeature(); 623 // start next one, with lots of lovely info in it 624 RichFeature.Template templ = new RichFeature.Template(); 625 templ.annotation = new SimpleRichAnnotation(); 626 templ.sourceTerm = Terms.getUniProtTerm(); 627 templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key); 628 templ.featureRelationshipSet = new TreeSet(); 629 templ.rankedCrossRefs = new TreeSet(); 630 String desc = null; 631 Matcher m = fp.matcher(val); 632 if (m.matches()) { 633 String loc = m.group(1); 634 desc = m.group(3); 635 templ.location = UniProtLocationParser.parseLocation(loc); 636 } else { 637 String message = ParseException.newMessage(this.getClass(),accession, "", "Bad feature value: "+val, sectionToString(section)); 638 throw new ParseException(message); 639 } 640 rlistener.startFeature(templ); 641 if (desc!=null && desc.length()>0) rlistener.addFeatureProperty(Terms.getFeatureDescTerm(),desc); 642 seenAFeature = true; 643 } 644 } 645 if (seenAFeature) rlistener.endFeature(); 646 } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) { 647 StringBuffer seq = new StringBuffer(); 648 for (int i = 0 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]); 649 try { 650 SymbolList sl = new SimpleSymbolList(symParser, 651 seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-")); 652 rlistener.addSymbols(symParser.getAlphabet(), 653 (Symbol[])(sl.toList().toArray(new Symbol[0])), 654 0, sl.length()); 655 } catch (IllegalAlphabetException e) { 656 String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section)); 657 throw new ParseException(e, message); 658 } 659 } 660 } while (!sectionKey.equals(END_SEQUENCE_TAG)); 661 }catch (RuntimeException e){ 662 String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section)); 663 throw new ParseException(e, message); 664 } 665 666 // Allows us to tolerate trailing whitespace without 667 // thinking that there is another Sequence to follow 668 while (true) { 669 reader.mark(1); 670 int c = reader.read(); 671 if (c == -1) { 672 hasAnotherSequence = false; 673 break; 674 } 675 if (Character.isWhitespace((char) c)) { 676 //hasInternalWhitespace = true; 677 continue; 678 } 679 //if (hasInternalWhitespace) 680 //System.err.println("Warning: whitespace found between sequence entries"); 681 reader.reset(); 682 break; 683 } 684 685 // Finish up. 686 rlistener.endSequence(); 687 return hasAnotherSequence; 688 } 689 690// reads an indented section, combining split lines and creating a list of key->value tuples 691 private List readSection(BufferedReader br) throws ParseException { 692 List section = new ArrayList(); 693 String line; 694 boolean done = false; 695 696 // while not done 697 try { 698 while (!done) { 699 // mark buffer 700 br.mark(160); 701 // read token 702 line = br.readLine(); 703 if (line.length()<2) { 704 String message = ParseException.newMessage(this.getClass(),accession, "", "Bad line found: "+line, sectionToString(section)); 705 throw new ParseException(message); 706 } 707 String token = line.substring(0,2); 708 // READ SEQUENCE SECTION 709 if (token.equals(START_SEQUENCE_TAG)) { 710 // from next line, read sequence until // - leave // on stack 711 StringBuffer sb = new StringBuffer(); 712 while (!done) { 713 br.mark(160); 714 line = br.readLine(); 715 if (line.startsWith(END_SEQUENCE_TAG)) { 716 br.reset(); 717 done = true; 718 } else { 719 // create sequence tag->value pair to return, sans numbers 720 sb.append(line); 721 } 722 } 723 section.add(new String[]{START_SEQUENCE_TAG,sb.toString()}); 724 } 725 // READ COMMENT SECTION 726 else if (token.equals(COMMENT_TAG)) { 727 // read from first line till next that begins with "CC -!-" 728 StringBuffer currentVal = new StringBuffer(); 729 boolean wasMisc = false; 730 if (!line.startsWith(COMMENT_TAG+" -!-")) wasMisc = true; 731 currentVal.append(line.substring(5)); 732 while (!done) { 733 br.mark(160); 734 line = br.readLine(); 735 if (((!wasMisc) && line.charAt(5)!=' ') || !line.startsWith("C") || line.startsWith(COMMENT_TAG+" -!-")) { 736 br.reset(); 737 done = true; 738 // dump current tag if exists 739 section.add(new String[]{COMMENT_TAG,currentVal.toString()}); 740 } else { 741 currentVal.append("\n"); 742 currentVal.append(line.substring(5)); 743 } 744 } 745 } 746 // READ FEATURE TABLE SECTION 747 else if (token.equals(FEATURE_TAG)) { 748 br.reset(); 749 // read all FT lines until first non-FT starting line 750 String currentTag = null; 751 StringBuffer currentVal = new StringBuffer(); 752 section.add(new String[]{FEATURE_TAG,null}); 753 while (!done) { 754 br.mark(160); 755 line = br.readLine(); 756 if (!line.startsWith(FEATURE_TAG)) { 757 br.reset(); 758 done = true; 759 // dump current tag if exists 760 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 761 } else { 762 // FT lines: FT KEY_NAME x x description 763 // or: FT .... 764 // or FT /FTId=899. 765 line = line.substring(5); // chomp off "FT " 766 if (!line.startsWith(" ")) { 767 // dump current tag if exists 768 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 769 // case 1 : word value - splits into key-value based on first 8 chars 770 currentTag = line.substring(0,8).trim(); 771 currentVal = new StringBuffer(); 772 currentVal.append(line.substring(8).trim()); 773 } else { 774 line = line.trim(); 775 if (line.startsWith("/")) { 776 // dump current tag if exists 777 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 778 // case 3 : /word=..... 779 currentVal = new StringBuffer(); 780 int equalIndex = line.indexOf('='); 781 if (equalIndex>=0) { 782 currentTag = line.substring(0, equalIndex); 783 currentVal.append(line.substring(equalIndex+1)); 784 } else { 785 currentTag = line; 786 } 787 } else { 788 // case 2 : ...." 789 currentVal.append("\n"); 790 currentVal.append(line); 791 } 792 } 793 } 794 } 795 } 796 // READ DOCREF 797 else if (token.equals(DATABASE_XREF_TAG)) { 798 section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()}); 799 done = true; 800 } 801 // READ DATE 802 else if (token.equals(DATE_TAG)) { 803 section.add(new String[]{DATE_TAG,line.substring(5).trim()}); 804 done = true; 805 } 806 // READ END OF SEQUENCE 807 else if (token.equals(END_SEQUENCE_TAG)) { 808 section.add(new String[]{END_SEQUENCE_TAG,null}); 809 done = true; 810 } 811 // READ NORMAL TAG/VALUE SECTION 812 else { 813 // rewind buffer to mark 814 br.reset(); 815 // read token/values until first with non-same first character 816 // exceptions: DE/DT, and RN...RN 817 String currentTag = null; 818 char currentTagStart = '\0'; 819 StringBuffer currentVal = null; 820 while (!done) { 821 br.mark(160); 822 line = br.readLine(); 823 if (currentTagStart=='\0') currentTagStart = line.charAt(0); 824 if (!line.startsWith(""+currentTagStart) || 825 (currentTagStart=='D' && currentTag!=null && !line.startsWith(""+currentTag)) || 826 (currentTagStart=='R' && currentTag!=null && line.startsWith("RN"))) { 827 br.reset(); 828 done = true; 829 // dump current tag if exists 830 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 831 } else { 832 try { 833 // merge neighbouring repeated tokens by concatting values 834 // return tag->value pairs 835 String tag = line.substring(0,2); 836 String value = line.substring(5); 837 if (currentTag==null || !tag.equals(currentTag)) { 838 // dump current tag if exists 839 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 840 // start new tag 841 currentTag = tag; 842 currentVal = new StringBuffer(); 843 currentVal.append(value); 844 } else { 845 currentVal.append("\n"); 846 currentVal.append(value); 847 } 848 } catch (Exception e) { 849 String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section)); 850 throw new ParseException(e, message); 851 } 852 } 853 } 854 } 855 } 856 } catch (IOException e) { 857 String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section)); 858 throw new ParseException(e, message); 859 } catch (RuntimeException e){ 860 String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section)); 861 throw new ParseException(e, message); 862 } 863 return section; 864 } 865 866 /** 867 * {@inheritDoc} 868 */ 869 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 870 if (this.getPrintStream()==null) this.setPrintStream(os); 871 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 872 } 873 874 /** 875 * {@inheritDoc} 876 */ 877 public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 878 if (this.getPrintStream()==null) this.setPrintStream(os); 879 if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format); 880 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 881 } 882 883 /** 884 * {@inheritDoc} 885 * Namespace is ignored as UniProt has no concept of it. 886 */ 887 public void writeSequence(Sequence seq, Namespace ns) throws IOException { 888 RichSequence rs; 889 try { 890 if (seq instanceof RichSequence) rs = (RichSequence)seq; 891 else rs = RichSequence.Tools.enrich(seq); 892 } catch (ChangeVetoException e) { 893 IOException e2 = new IOException("Unable to enrich sequence"); 894 e2.initCause(e); 895 throw e2; 896 } 897 898 SymbolTokenization tok; 899 try { 900 tok = rs.getAlphabet().getTokenization("token"); 901 } catch (Exception e) { 902 throw new RuntimeException("Unable to get alphabet tokenizer",e); 903 } 904 905 Set<Note> notes = rs.getNoteSet(); 906 String accession = rs.getAccession(); 907 StringBuffer accessions = new StringBuffer(); 908 accessions.append(accession); 909 accessions.append(";"); 910 String cdat = null; 911 String udat = null; 912 String adat = null; 913 String dbname = "?"; 914 String arel = null; 915 String organelle = null; 916 String protExists = null; 917 String dataclass = "STANDARD"; 918 String copyright = null; 919 Map speciesRecs = new TreeMap(); 920 Map strainRecs = new TreeMap(); 921 Map tissueRecs = new TreeMap(); 922 Map transpRecs = new TreeMap(); 923 Map plasmidRecs = new TreeMap(); 924 Map genenames = new TreeMap(); 925 Map genesynonyms = new TreeMap(); 926 Map orfnames = new TreeMap(); 927 Map ordlocnames = new TreeMap(); 928 for (Iterator<Note> i = notes.iterator(); i.hasNext(); ) { 929 Note n = i.next(); 930 if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue(); 931 else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); 932 else if (n.getTerm().equals(Terms.getDateAnnotatedTerm())) adat=n.getValue(); 933 else if (n.getTerm().equals(Terms.getUniProtDBNameTerm())) dbname=n.getValue(); 934 else if (n.getTerm().equals(Terms.getProteinExistsTerm())) protExists=n.getValue(); 935 else if (n.getTerm().equals(Terms.getRelAnnotatedTerm())) arel=n.getValue(); 936 else if (n.getTerm().equals(Terms.getDataClassTerm())) dataclass = n.getValue(); 937 else if (n.getTerm().equals(Terms.getCopyrightTerm())) copyright = n.getValue(); 938 else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 939 accessions.append(" "); 940 accessions.append(n.getValue()); 941 accessions.append(";"); 942 } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle = (organelle==null?"":organelle+"; ")+n.getValue(); 943 // use the nasty hack to split the reference rank away from the actual value in this field 944 else if (n.getTerm().equals(Terms.getGeneNameTerm())) { 945 String ref = n.getValue(); 946 int colon = ref.indexOf(':'); 947 Integer refID = new Integer(0); 948 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 949 genenames.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene 950 } else if (n.getTerm().equals(Terms.getGeneSynonymTerm())) { 951 String ref = n.getValue(); 952 int colon = ref.indexOf(':'); 953 Integer refID = new Integer(0); 954 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 955 if (genesynonyms.get(refID)==null) genesynonyms.put(refID, new ArrayList()); 956 ((List)genesynonyms.get(refID)).add(ref.substring(colon+1)); 957 } else if (n.getTerm().equals(Terms.getOrderedLocusNameTerm())) { 958 String ref = n.getValue(); 959 int colon = ref.indexOf(':'); 960 Integer refID = new Integer(0); 961 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 962 if (ordlocnames.get(refID)==null) ordlocnames.put(refID, new ArrayList()); 963 ((List)ordlocnames.get(refID)).add(ref.substring(colon+1)); 964 } else if (n.getTerm().equals(Terms.getORFNameTerm())) { 965 String ref = n.getValue(); 966 int colon = ref.indexOf(':'); 967 Integer refID = new Integer(0); 968 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 969 if (orfnames.get(refID)==null) orfnames.put(refID, new ArrayList()); 970 ((List)orfnames.get(refID)).add(ref.substring(colon+1)); 971 } 972 // use the nasty hack to split the reference rank away from the actual value in this field 973 // we'll end up with a bunch in key 0 for those which did not come from us. We ignore these for now. 974 else if (n.getTerm().equals(Terms.getSpeciesTerm())) { 975 String ref = n.getValue(); 976 int colon = ref.indexOf(':'); 977 Integer refID = new Integer(0); 978 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 979 if (speciesRecs.get(refID)==null) speciesRecs.put(refID, new ArrayList()); 980 ((List)speciesRecs.get(refID)).add(ref.substring(colon+1)); 981 } else if (n.getTerm().equals(Terms.getStrainTerm())) { 982 String ref = n.getValue(); 983 int colon = ref.indexOf(':'); 984 Integer refID = new Integer(0); 985 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 986 if (strainRecs.get(refID)==null) strainRecs.put(refID, new ArrayList()); 987 ((List)strainRecs.get(refID)).add(ref.substring(colon+1)); 988 } else if (n.getTerm().equals(Terms.getTissueTerm())) { 989 String ref = n.getValue(); 990 int colon = ref.indexOf(':'); 991 Integer refID = new Integer(0); 992 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 993 if (tissueRecs.get(refID)==null) tissueRecs.put(refID, new ArrayList()); 994 ((List)tissueRecs.get(refID)).add(ref.substring(colon+1)); 995 } else if (n.getTerm().equals(Terms.getTransposonTerm())) { 996 String ref = n.getValue(); 997 int colon = ref.indexOf(':'); 998 Integer refID = new Integer(0); 999 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 1000 if (transpRecs.get(refID)==null) transpRecs.put(refID, new ArrayList()); 1001 ((List)transpRecs.get(refID)).add(ref.substring(colon+1)); 1002 } else if (n.getTerm().equals(Terms.getPlasmidTerm())) { 1003 String ref = n.getValue(); 1004 int colon = ref.indexOf(':'); 1005 Integer refID = new Integer(0); 1006 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 1007 if (plasmidRecs.get(refID)==null) plasmidRecs.put(refID, new ArrayList()); 1008 ((List)plasmidRecs.get(refID)).add(ref.substring(colon+1)); 1009 } 1010 } 1011 1012 // entryname dataclass; [circular] molecule; division; sequencelength BP. 1013 StringBuffer locusLine = new StringBuffer(); 1014 locusLine.append(StringTools.rightPad(rs.getName()+"_"+rs.getDivision(),12)); 1015 locusLine.append(" "); 1016 locusLine.append(StringTools.leftPad(dataclass,19)); 1017 //locusLine.append("; PRT; "); //Uniprot no longer uses the PRT; 1018 locusLine.append("; "); 1019 locusLine.append(StringTools.leftPad(""+rs.length(),11)); 1020 locusLine.append(" AA."); 1021 StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream()); 1022 1023 // accession line 1024 StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream()); 1025 1026 // date line 1027 StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+", integrated into UniProtKB/"+dbname+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); 1028 StringTools.writeKeyValueLine(DATE_TAG, udat+", sequence version "+rs.getVersion()+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); 1029 StringTools.writeKeyValueLine(DATE_TAG, (adat==null?udat:adat)+", entry version "+(arel==null?"0":arel)+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); 1030 1031 // definition line 1032 StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription()+".", 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream()); 1033 1034 // gene line 1035 for (Iterator i = genenames.keySet().iterator(); i.hasNext(); ) { 1036 Integer geneid = (Integer)i.next(); 1037 String genename = (String)genenames.get(geneid); 1038 List synonyms = (List)genesynonyms.get(geneid); 1039 List orfs = (List)orfnames.get(geneid); 1040 List ordlocs = (List)ordlocnames.get(geneid); 1041 1042 StringBuffer gnline = new StringBuffer(); 1043 gnline.append(Terms.GENENAME_KEY); 1044 gnline.append("="); 1045 gnline.append(genename); 1046 gnline.append("; "); 1047 1048 if (synonyms!=null) { 1049 gnline.append(Terms.GENESYNONYM_KEY); 1050 gnline.append("="); 1051 for (Iterator j = synonyms.iterator(); j.hasNext(); ) { 1052 gnline.append((String)j.next()); 1053 if (j.hasNext()) gnline.append(", "); 1054 } 1055 gnline.append("; "); 1056 } 1057 if (ordlocs!=null) { 1058 gnline.append(Terms.ORDLOCNAME_KEY); 1059 gnline.append("="); 1060 for (Iterator j = ordlocs.iterator(); j.hasNext(); ) { 1061 gnline.append((String)j.next()); 1062 if (j.hasNext()) gnline.append(", "); 1063 } 1064 gnline.append("; "); 1065 } 1066 if (orfs!=null) { 1067 gnline.append(Terms.ORFNAME_KEY); 1068 gnline.append("="); 1069 for (Iterator j = orfs.iterator(); j.hasNext(); ) { 1070 gnline.append((String)j.next()); 1071 if (j.hasNext()) gnline.append(", "); 1072 } 1073 gnline.append("; "); 1074 } 1075 1076 StringTools.writeKeyValueLine(GENE_TAG, gnline.toString(), 5, this.getLineWidth(), null, GENE_TAG, this.getPrintStream()); 1077 1078 if (i.hasNext()) StringTools.writeKeyValueLine(GENE_TAG, "and", 5, this.getLineWidth(), null, GENE_TAG, this.getPrintStream()); 1079 } 1080 1081 // source line (from taxon) 1082 // organism line 1083 NCBITaxon tax = rs.getTaxon(); 1084 if (tax!=null) { 1085 StringBuffer source = new StringBuffer(); 1086 source.append(tax.getDisplayName()); 1087 for (Iterator j = tax.getNames(NCBITaxon.SYNONYM).iterator(); j.hasNext(); ) { 1088 source.append(" ("); 1089 source.append((String)j.next()); 1090 source.append(")"); 1091 } 1092 source.append("."); 1093 StringTools.writeKeyValueLine(SOURCE_TAG, source.toString(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream()); 1094 if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle+".", 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream()); 1095 StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, ORGANISM_TAG, this.getPrintStream()); 1096 StringTools.writeKeyValueLine(TAXON_TAG, "NCBI_TaxID="+tax.getNCBITaxID()+";", 5, this.getLineWidth(), this.getPrintStream()); 1097 } 1098 1099 // references - rank (bases x to y) 1100 for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) { 1101 RankedDocRef rdr = r.next(); 1102 DocRef d = rdr.getDocumentReference(); 1103 // RN, RP, RC, RX, RG, RA, RT, RL 1104 StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream()); 1105 if (d.getRemark()!=null) 1106 StringTools.writeKeyValueLine(RP_LINE_TAG, d.getRemark()+".", 5, this.getLineWidth(), null, RP_LINE_TAG, this.getPrintStream()); 1107 // Print out ref position if present 1108 if (rdr.getStart()!=null && rdr.getEnd()!=null && d.getRemark()!=null && !rppat.matcher(d.getRemark()).matches()) StringTools.writeKeyValueLine(RP_LINE_TAG, "SEQUENCE OF "+rdr.getStart()+"-"+rdr.getEnd()+".", 5, this.getLineWidth(), null, RP_LINE_TAG, this.getPrintStream()); 1109 // RC lines 1110 StringBuffer rcline = new StringBuffer(); 1111 Integer rank = new Integer(rdr.getRank()); 1112 if (speciesRecs.get(rank)!=null) { 1113 rcline.append(Terms.SPECIES_KEY); 1114 rcline.append("="); 1115 for (Iterator i = ((List)speciesRecs.get(rank)).iterator(); i.hasNext(); ) { 1116 rcline.append((String)i.next()); 1117 if (i.hasNext()) rcline.append(", "); 1118 } 1119 rcline.append("; "); 1120 } 1121 if (strainRecs.get(rank)!=null) { 1122 rcline.append(Terms.STRAIN_KEY); 1123 rcline.append("="); 1124 for (Iterator i = ((List)strainRecs.get(rank)).iterator(); i.hasNext(); ) { 1125 rcline.append((String)i.next()); 1126 if (i.hasNext()) rcline.append(", "); 1127 } 1128 rcline.append("; "); 1129 } 1130 if (tissueRecs.get(rank)!=null) { 1131 rcline.append(Terms.TISSUE_KEY); 1132 rcline.append("="); 1133 for (Iterator i = ((List)tissueRecs.get(rank)).iterator(); i.hasNext(); ) { 1134 rcline.append((String)i.next()); 1135 if (i.hasNext()) rcline.append(", "); 1136 } 1137 rcline.append("; "); 1138 } 1139 if (transpRecs.get(rank)!=null) { 1140 rcline.append(Terms.TRANSPOSON_KEY); 1141 rcline.append("="); 1142 for (Iterator i = ((List)transpRecs.get(rank)).iterator(); i.hasNext(); ) { 1143 rcline.append((String)i.next()); 1144 if (i.hasNext()) rcline.append(", "); 1145 } 1146 rcline.append("; "); 1147 } 1148 if (plasmidRecs.get(rank)!=null) { 1149 rcline.append(Terms.PLASMID_KEY); 1150 rcline.append("="); 1151 for (Iterator i = ((List)plasmidRecs.get(rank)).iterator(); i.hasNext(); ) { 1152 rcline.append((String)i.next()); 1153 if (i.hasNext()) rcline.append(", "); 1154 } 1155 rcline.append("; "); 1156 } 1157 // print the rcline 1158 if (rcline.length()>0) StringTools.writeKeyValueLine(RC_LINE_TAG, rcline.toString(), 5, this.getLineWidth(), null, RC_LINE_TAG, this.getPrintStream()); 1159 // Deal with RX and rest 1160 CrossRef c = d.getCrossref(); 1161 if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"="+c.getAccession()+";", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream()); 1162 List<DocRefAuthor> auths = d.getAuthorList(); 1163 for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) { 1164 DocRefAuthor a = j.next(); 1165 if (a.isConsortium()) { 1166 StringTools.writeKeyValueLine(CONSORTIUM_TAG, a.getName()+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream()); 1167 j.remove(); 1168 } 1169 } 1170 if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, false)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream()); 1171 if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream()); 1172 StringTools.writeKeyValueLine(LOCATION_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATION_TAG, this.getPrintStream()); 1173 } 1174 1175 // comments - if any 1176 if (!rs.getComments().isEmpty()) { 1177 for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) { 1178 Comment c = i.next(); 1179 String text = c.getComment().trim(); 1180 if (text.length()>3 && text.substring(0,3).equals("-!-")) StringTools.writeKeyValueLine(COMMENT_TAG, text, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream()); 1181 else StringTools.writeKeyValueLine(COMMENT_TAG, text, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream()); 1182 } 1183 } 1184 1185 // copyright - if any 1186 if (copyright!=null) 1187 StringTools.writeKeyValueLine(COMMENT_TAG, copyright, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream()); 1188 1189 // db references - ranked 1190 for (Iterator<RankedCrossRef> r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) { 1191 RankedCrossRef rcr = r.next(); 1192 CrossRef c = rcr.getCrossRef(); 1193 Set<Note> noteset = c.getNoteSet(); 1194 StringBuffer sb = new StringBuffer(); 1195 sb.append(c.getDbname()); 1196 sb.append("; "); 1197 sb.append(c.getAccession()); 1198 boolean hasSecondary = false; 1199 for (Iterator<Note> i = noteset.iterator(); i.hasNext(); ) { 1200 Note n = i.next(); 1201 if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 1202 sb.append("; "); 1203 sb.append(n.getValue()); 1204 hasSecondary = true; 1205 } 1206 } 1207 if (!hasSecondary) sb.append("; -"); 1208 sb.append("."); 1209 StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream()); 1210 } 1211 1212 // protein exists line 1213 if (protExists!=null) { 1214 StringTools.writeKeyValueLine(PROTEIN_EXIST_TAG, protExists+";", 5, this.getLineWidth(), null, PROTEIN_EXIST_TAG, this.getPrintStream()); 1215 } 1216 1217 // keywords line 1218 String keywords = null; 1219 for (Iterator<Note> n = notes.iterator(); n.hasNext(); ) { 1220 Note nt = n.next(); 1221 if (nt.getTerm().equals(Terms.getKeywordTerm())) { 1222 if (keywords==null) keywords = nt.getValue(); 1223 else keywords = keywords+"; "+nt.getValue(); 1224 } 1225 } 1226 if (keywords!=null) { 1227 StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords+".", 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream()); 1228 } 1229 1230 // feature_type location 1231 for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { 1232 RichFeature f = (RichFeature)i.next(); 1233 String desc = ""; 1234 String ftid = null; 1235 for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) { 1236 Note n = j.next(); 1237 if (n.getTerm().equals(Terms.getFTIdTerm())) ftid = n.getValue(); 1238 else if (n.getTerm().equals(Terms.getFeatureDescTerm())) desc = n.getValue(); 1239 } 1240 String kw = f.getTypeTerm().getName(); 1241 String leader = StringTools.rightPad(kw,8)+" "+UniProtLocationParser.writeLocation((RichLocation)f.getLocation()); 1242 if(desc.length()==0) 1243 this.getPrintStream().println(FEATURE_TAG+" "+leader); //see #2277 1244 else 1245 StringTools.writeKeyValueLine(FEATURE_TAG+" "+leader, desc+".", 34, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); 1246 if (ftid!=null) StringTools.writeKeyValueLine(FEATURE_TAG, "/FTId="+ftid+".", 34, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); 1247 } 1248 1249 // sequence header 1250 int mw = 0; 1251 try { 1252 mw = (int)MassCalc.getMolecularWeight(rs); 1253 } catch (IllegalSymbolException e) { 1254 throw new RuntimeException("Found illegal symbol", e); 1255 } 1256 CRC64Checksum crc = new CRC64Checksum(); 1257 String seqstr = rs.seqString(); 1258 crc.update(seqstr.getBytes(),0,seqstr.length()); 1259 this.getPrintStream().print(START_SEQUENCE_TAG+" SEQUENCE "+StringTools.leftPad(""+rs.length(),4)+" AA; "); 1260 this.getPrintStream().print(StringTools.leftPad(""+mw,5)+" MW; "); 1261 this.getPrintStream().println(crc+" CRC64;"); 1262 1263 // sequence stuff 1264 Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]); 1265 int symCount = 0; 1266 this.getPrintStream().print(" "); 1267 for (int i = 0; i < syms.length; i++) { 1268 if (symCount % 60 == 0 && symCount>0) { 1269 this.getPrintStream().print("\n "); 1270 } 1271 if (symCount % 10 == 0) { 1272 this.getPrintStream().print(" "); 1273 } 1274 try { 1275 this.getPrintStream().print(tok.tokenizeSymbol(syms[i])); 1276 } catch (IllegalSymbolException e) { 1277 throw new RuntimeException("Found illegal symbol: "+syms[i]); 1278 } 1279 symCount++; 1280 } 1281 this.getPrintStream().print("\n"); 1282 this.getPrintStream().println(END_SEQUENCE_TAG); 1283 } 1284 1285 /** 1286 * {@inheritDoc} 1287 */ 1288 public String getDefaultFormat() { 1289 return UNIPROT_FORMAT; 1290 } 1291 1292 /** 1293 * Converts the current parse section to a String. Useful for debugging. 1294 */ 1295 String sectionToString(List section){ 1296 StringBuffer parseBlock = new StringBuffer(); 1297 for(Iterator i = section.listIterator(); i.hasNext();){ 1298 String[] part = (String[])i.next(); 1299 for(int x = 0; x < part.length; x++){ 1300 parseBlock.append(part[x]); 1301 if(x == 0){ 1302 parseBlock.append(" "); //the gap will have been trimmed 1303 } 1304 } 1305 } 1306 return parseBlock.toString(); 1307 } 1308}