001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This 005 * should be distributed with the code. If you do not have a copy, see: 006 * 007 * http://www.gnu.org/copyleft/lesser.html 008 * 009 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments. 010 * 011 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page 012 * at: 013 * 014 * http://www.biojava.org/ 015 * 016 * Created on August 13, 2010 Author: Mark Chapman 017 */ 018 019package org.biojava.nbio.alignment.io; 020 021import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 022import org.biojava.nbio.core.sequence.ProteinSequence; 023import org.biojava.nbio.core.sequence.RNASequence; 024import org.biojava.nbio.core.sequence.template.AbstractCompound; 025import org.biojava.nbio.core.sequence.template.AbstractSequence; 026import org.slf4j.Logger; 027import org.slf4j.LoggerFactory; 028 029import java.util.ArrayList; 030import java.util.HashMap; 031import java.util.List; 032import java.util.Map; 033 034/** 035 * Stores all the content of a Stockholm file. <i><b>N.B.: This structure will undergo several enhancements later on. 036 * Don't depend on it in a final code, otherwise it will be hard to maintain.</b></i> 037 * 038 * In general, Stockholm File contains the alignment mark-up lines.<br> 039 * <br> 040 * 041 * <table> 042 * <caption></caption> 043 * <tr> 044 * <td><b>Header Section</b></td> 045 * </tr> 046 * <tr> 047 * <td><b>Reference Section</b></td> 048 * </tr> 049 * <tr> 050 * <td><b>Comment Section</b></td> 051 * </tr> 052 * <tr> 053 * <td><B>Alignment Section</B></td> 054 * </tr> 055 * </table> 056 * 057 * Sequence letters may include any characters except whitespace. Gaps may be indicated by "." or "-".<br> 058 * Mark-up lines may include any characters except whitespace. Use underscore ("_") instead of space.<br> 059 * 060 * <table> 061 * <caption></caption> 062 * <tr> 063 * <th>section field</th> 064 * <th>preferred location</th> 065 * </tr> 066 * <tr> 067 * <td>#=GF <feature> <Generic per-File annotation, free text></td> 068 * <td>Above the alignment</td> 069 * </tr> 070 * <tr> 071 * <td>#=GC <feature> <Generic per-Column annotation, exactly 1 char per column></td> 072 * <td>Below the alignment</td> 073 * </tr> 074 * <tr> 075 * <td>#=GS <seqname> <feature> <Generic per-Sequence annotation, free text></td> 076 * <td>Above the alignment or just below the corresponding sequence</td> 077 * </tr> 078 * <tr> 079 * <td>#=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue></td> 080 * <td>Just below the corresponding sequence</td> 081 * </tr> 082 * </table> 083 * 084 * @since 3.0.5 085 * @author Amr ALHOSSARY 086 * @author Marko Vaz 087 * 088 */ 089public class StockholmStructure { 090 091 private final static Logger logger = LoggerFactory.getLogger(StockholmStructure.class); 092 093 public static final String PFAM = "PFAM"; 094 public static final String RFAM = "RFAM"; 095 private final StockholmFileAnnotation fileAnnotation; 096 private final StockholmConsensusAnnotation consAnnotation; 097 private final Map<String, StringBuffer> sequences; 098 private final Map<String, StockholmSequenceAnnotation> seqsAnnotation; 099 private final Map<String, StockholmResidueAnnotation> resAnnotation; 100 101 public StockholmStructure() { 102 fileAnnotation = new StockholmFileAnnotation(); 103 consAnnotation = new StockholmConsensusAnnotation(); 104 sequences = new HashMap<>(); 105 seqsAnnotation = new HashMap<>(); 106 resAnnotation = new HashMap<>(); 107 } 108 109 public StockholmFileAnnotation getFileAnnotation() { 110 return fileAnnotation; 111 } 112 113 public StockholmConsensusAnnotation getConsAnnotation() { 114 return consAnnotation; 115 } 116 117 /** 118 * @param seqName 119 * @param seqText 120 */ 121 public void appendToSequence(String seqName, String seqText) { 122 StringBuffer seq = sequences.get(seqName); 123 if (seq != null) { 124 // add sequence without space 125 seq.append(seqText); 126 } else { 127 seq = new StringBuffer(seqText); 128 sequences.put(seqName, seq); 129 } 130 } 131 132 public Map<String, StringBuffer> getSequences() { 133 return sequences; 134 } 135 136 private StockholmSequenceAnnotation getSequenceAnnotation(String seqName) { 137 if (!seqsAnnotation.containsKey(seqName)) { 138 seqsAnnotation.put(seqName, new StockholmSequenceAnnotation()); 139 } 140 return seqsAnnotation.get(seqName); 141 } 142 143 /** 144 * @param seqName 145 * @param text 146 */ 147 public void addGSAccessionNumber(String seqName, String text) { 148 getSequenceAnnotation(seqName).setAccessionNumber(text); 149 } 150 151 public void addGSDescription(String seqName, String text) { 152 getSequenceAnnotation(seqName).addToDescription(text); 153 } 154 155 /** 156 * @param seqName 157 * @param text 158 */ 159 public void addGSdbReference(String seqName, String text) { 160 getSequenceAnnotation(seqName).addDBReference(text); 161 } 162 163 public void addGSOrganismSpecies(String seqName, String text) { 164 getSequenceAnnotation(seqName).setOrganism(text); 165 } 166 167 public void addGSOrganismClassification(String seqName, String text) { 168 getSequenceAnnotation(seqName).setOrganismClassification(text); 169 } 170 171 public void addGSLook(String seqName, String text) { 172 getSequenceAnnotation(seqName).setLook(text); 173 } 174 175 private StockholmResidueAnnotation getResidueAnnotation(String seqName) { 176 if (!resAnnotation.containsKey(seqName)) { 177 resAnnotation.put(seqName, new StockholmResidueAnnotation()); 178 } 179 return resAnnotation.get(seqName); 180 } 181 182 public void addSurfaceAccessibility(String seqName, String text) { 183 getResidueAnnotation(seqName).setSurfaceAccessibility(text); 184 } 185 186 public void addTransMembrane(String seqName, String text) { 187 getResidueAnnotation(seqName).setTransMembrane(text); 188 } 189 190 public void addPosteriorProbability(String seqName, String text) { 191 getResidueAnnotation(seqName).setPosteriorProbability(text); 192 } 193 194 public void addLigandBinding(String seqName, String text) { 195 getResidueAnnotation(seqName).setLigandBinding(text); 196 } 197 198 public void addActiveSite(String seqName, String text) { 199 getResidueAnnotation(seqName).setActiveSite(text); 200 } 201 202 public void addASPFamPredicted(String seqName, String text) { 203 getResidueAnnotation(seqName).setAsPFamPredicted(text); 204 } 205 206 public void addASSwissProt(String seqName, String text) { 207 getResidueAnnotation(seqName).setAsSwissProt(text); 208 } 209 210 public void addIntron(String seqName, String text) { 211 getResidueAnnotation(seqName).setIntron(text); 212 } 213 214 public void addSecondaryStructure(String seqName, String text) { 215 getResidueAnnotation(seqName).setSecondaryStructure(text); 216 } 217 218 /** 219 * used to retrieve sequences from the structure 220 * 221 * @return Biosequences (case sensitive) 222 * @see #getBioSequences(boolean) 223 * @see #getBioSequences(boolean, String) 224 */ 225 public List<AbstractSequence<? extends AbstractCompound>> getBioSequences() { 226 return getBioSequences(false); 227 } 228 229 /** 230 * This function tolerates mixed case letters, and allows for forcing the output biosequence type (PFAM/RFAM). 231 * 232 * @param ignoreCase 233 * if <code>true</code>, the function will deal with small letters as if they are capital ones 234 * @param forcedSequenceType 235 * either <code>null</code>, {@link #PFAM}, or {@link #RFAM}. 236 * @return Biosequences according to the criteria specified 237 * @see #getBioSequences() 238 * @see #getBioSequences(boolean) 239 */ 240 public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase, 241 String forcedSequenceType) { 242 if (forcedSequenceType != null && !(forcedSequenceType.equals(PFAM) || forcedSequenceType.equals(RFAM))) { 243 throw new IllegalArgumentException("Illegal Argument " + forcedSequenceType); 244 } 245 List<AbstractSequence<? extends AbstractCompound>> seqs = new ArrayList<>(); 246 for (String sequencename : sequences.keySet()) { 247 AbstractSequence<? extends AbstractCompound> seq = null; 248 String sequence = sequences.get(sequencename).toString(); 249 if (ignoreCase) { 250 sequence = sequence.toUpperCase(); 251 } 252 253 try { 254 if (forcedSequenceType == null) 255 seq = fileAnnotation.isPFam() ? new ProteinSequence(sequence) : new RNASequence(sequence); 256 else if (forcedSequenceType.equals(PFAM)) 257 seq = new ProteinSequence(sequence); 258 else 259 seq = new RNASequence(sequence); 260 } catch (CompoundNotFoundException e) { 261 logger.warn("Could not create sequence because of unknown compounds ({}). Sequence {} will be ignored.",e.getMessage(),sequencename); 262 continue; 263 } 264 String[] seqDetails = splitSeqName(sequencename); 265 seq.setDescription(seqDetails[0]); 266 seq.setBioBegin((seqDetails[1] == null || "".equals(seqDetails[1].trim()) ? null : Integer.valueOf(seqDetails[1]))); 267 seq.setBioEnd((seqDetails[2] == null || "".equals(seqDetails[2].trim()) ? null : Integer.valueOf(seqDetails[2]))); 268 269 seqs.add(seq); 270 } 271 return seqs; 272 } 273 274 /** 275 * Because some database files have incorrectly small letters (e.g. Pfam23 structure PF00389.22 sequence 276 * TKRA_BACSU/6-322), this function is used to ignore the small letters case. 277 * 278 * @param ignoreCase 279 * @return 280 * @see #getBioSequences() 281 * @see #getBioSequences(boolean, String) 282 */ 283 public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase) { 284 return getBioSequences(ignoreCase, null); 285 } 286 287 /** 288 * Returns an array with the following sequence related content: name, start, end. 289 * 290 * @param sequenceName 291 * the sequence from where to extract the content. It is supposed that it follows the following 292 * convention name/start-end (e.g.: COATB_BPIKE/30-81) 293 * @return array with the following sequence related content: name, start, end. 294 */ 295 private String[] splitSeqName(String sequenceName) { 296 String[] result = new String[3]; 297 298 String[] barSplit = sequenceName.split("/"); 299 if (barSplit.length == 2) { 300 result[0] = barSplit[0]; 301 String[] positions = barSplit[1].split("-"); 302 if (positions.length == 2) { 303 result[1] = positions[0]; 304 result[2] = positions[1]; 305 } 306 } else { 307 result[0] = sequenceName; 308 result[1] = null; 309 result[2] = null; 310 } 311 312 return result; 313 } 314 315 @Override 316 public String toString() { 317 StringBuffer result = new StringBuffer(); 318 List<AbstractSequence<? extends AbstractCompound>> bioSeqs = getBioSequences(false); 319 int sequenceLength = -1; 320 for (AbstractSequence<? extends AbstractCompound> sequence : bioSeqs) { 321 String sequenceAsString = sequence.getSequenceAsString(); 322 sequenceLength = sequenceAsString.length(); 323 if (sequenceLength > 50) { 324 result.append(sequenceAsString.substring(0, 40)); 325 result.append("..."); 326 result.append(sequenceAsString.substring(sequenceLength - 3, sequenceLength)); 327 } else { 328 result.append(sequenceAsString); 329 } 330 result.append(" " + sequence.getDescription() + "\n"); 331 } 332 result.append("Alignment with " + bioSeqs.size() + " rows and " + sequenceLength + " columns"); 333 334 return result.toString(); 335 } 336 337 public static class DatabaseReference { 338 public static final String EXPERT = "EXPERT"; 339 public static final String MIM = "MIM"; 340 public static final String PFAMB = "PFAMB"; 341 public static final String PRINTS = "PRINTS"; 342 public static final String PROSITE = "PROSITE"; 343 public static final String PROSITE_PROFILE = "PROSITE_PROFILE"; 344 public static final String SCOP = "SCOP"; 345 public static final String PDB = "PDB"; 346 public static final String SMART = "SMART"; 347 public static final String URL = "URL"; 348 public static final String LOAD = "LOAD"; 349 public static final String HOMSTRAD = "HOMSTRAD"; 350 public static final String INTERPRO = "INTERPRO"; 351 352 private final String database; 353 /** TODO this field should be subdivided into smaller fields if the database is SCOP or PDB. */ 354 private final String reference; 355 356 public DatabaseReference(String database, String reference) { 357 this.database = database; 358 this.reference = reference; 359 } 360 361 public DatabaseReference(String representativeAnnotationString) { 362 int semiColonIndex = representativeAnnotationString.indexOf(';'); 363 this.database = representativeAnnotationString.substring(0, semiColonIndex); 364 this.reference = representativeAnnotationString.substring(semiColonIndex + 1, 365 representativeAnnotationString.lastIndexOf(';')).trim(); 366 } 367 368 @Override 369 public String toString() { 370 return new StringBuilder(this.database).append(';').append(' ').append(this.reference).append(';') 371 .toString(); 372 } 373 374 public String getDatabase() { 375 return database; 376 } 377 378 public String getReference() { 379 return reference; 380 } 381 } 382}