001/** 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 2013-05-28 021 * Created by Douglas Myers-Turnbull 022 * 023 * @since 3.0.6 024 */ 025package org.biojava.nbio.structure.io; 026 027import org.biojava.nbio.core.alignment.template.AlignedSequence; 028import org.biojava.nbio.core.alignment.template.SequencePair; 029import org.biojava.nbio.structure.*; 030import org.biojava.nbio.structure.align.model.AFPChain; 031import org.biojava.nbio.structure.align.util.AlignmentTools; 032import org.biojava.nbio.structure.align.xml.AFPChainXMLConverter; 033import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 034import org.biojava.nbio.core.sequence.ProteinSequence; 035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 036import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 037import org.biojava.nbio.core.sequence.io.CasePreservingProteinSequenceCreator; 038import org.biojava.nbio.core.sequence.io.FastaReader; 039import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser; 040import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 041import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 042import org.biojava.nbio.core.sequence.template.Sequence; 043import org.biojava.nbio.core.util.SequenceTools; 044import org.slf4j.Logger; 045import org.slf4j.LoggerFactory; 046 047import java.io.File; 048import java.io.FileInputStream; 049import java.io.IOException; 050import java.io.InputStream; 051import java.util.*; 052 053/** 054 * A collection of static utilities to convert between {@link AFPChain AFPChains} and FastaSequences. 055 * 056 * @author dmyersturnbull 057 * @see StructureSequenceMatcher 058 * @see FastaStructureParser 059 * @see SeqRes2AtomAligner 060 */ 061public class FastaAFPChainConverter { 062 063 private final static Logger logger = LoggerFactory.getLogger(FastaAFPChainConverter.class); 064 065 066 public static AFPChain cpFastaToAfpChain(String first, String second, Structure structure, int cpSite) throws StructureException, CompoundNotFoundException { 067 ProteinSequence s1 = new ProteinSequence(first); 068 s1.setUserCollection(getAlignedUserCollection(first)); 069 ProteinSequence s2 = new ProteinSequence(second); 070 s2.setUserCollection(getAlignedUserCollection(second)); 071 return cpFastaToAfpChain(s1, s2, structure, cpSite); 072 } 073 074 /** 075 * Takes a structure and sequence corresponding to an alignment between a structure or sequence and itself (or even a structure with a sequence), where the result has a circular permutation site 076 * <code>cpSite</code> residues to the right. 077 * 078 * @param fastaFile A FASTA file containing exactly 2 sequences, the first unpermuted and the second permuted 079 * @param cpSite 080 * The number of residues from the beginning of the sequence at which the circular permutation site occurs; can be positive or negative; values greater than the length of the sequence 081 * are acceptable 082 * @throws IOException 083 * @throws StructureException 084 */ 085 public static AFPChain cpFastaToAfpChain(File fastaFile, Structure structure, int cpSite) throws IOException, StructureException { 086 InputStream inStream = new FileInputStream(fastaFile); 087 SequenceCreatorInterface<AminoAcidCompound> creator = new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()); 088 SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser = new GenericFastaHeaderParser<>(); 089 FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<>(inStream, headerParser, creator); 090 Map<String, ProteinSequence> sequences = fastaReader.process(); 091 inStream.close(); 092 Iterator<ProteinSequence> iter = sequences.values().iterator(); 093 ProteinSequence first = iter.next(); 094 ProteinSequence second = iter.next(); 095 return cpFastaToAfpChain(first, second, structure, cpSite); 096 } 097 098 /** 099 * Takes a structure and sequence corresponding to an alignment between a structure or sequence and itself (or even a structure with a sequence), where the result has a circular permutation site 100 * <code>cpSite</code> residues to the right. 101 * 102 * @param first The unpermuted sequence 103 * @param second The sequence permuted by cpSite 104 * @param cpSite 105 * The number of residues from the beginning of the sequence at which the circular permutation site occurs; can be positive or negative; values greater than the length of the sequence 106 * are acceptable 107 * @throws StructureException 108 */ 109 public static AFPChain cpFastaToAfpChain(ProteinSequence first, ProteinSequence second, Structure structure, int cpSite) 110 throws StructureException { 111 112 if (structure == null) { 113 throw new IllegalArgumentException("The structure is null"); 114 } 115 116 if (first == null) { 117 throw new IllegalArgumentException("The sequence is null"); 118 } 119 120 // we need to find the ungapped CP site 121 int gappedCpShift = 0; 122 int ungappedCpShift = 0; 123 while (ungappedCpShift < Math.abs(cpSite)) { 124 char c; 125 try { 126 if (cpSite <= 0) { 127 c = second.getSequenceAsString().charAt(gappedCpShift); 128 } else { 129 c = second.getSequenceAsString().charAt(first.getLength()-1 - gappedCpShift); 130 } 131 } catch (StringIndexOutOfBoundsException e) { 132 throw new IllegalArgumentException("CP site of " + cpSite + " is wrong"); 133 } 134 if (c != '-') { 135 ungappedCpShift++; 136 } 137 gappedCpShift++; 138 } 139 140 Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure); 141 Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure); // can't use cloneCAArray because it doesn't set parent group.chain.structure 142 143 ProteinSequence antipermuted = null; 144 try { 145 antipermuted = new ProteinSequence(SequenceTools.permuteCyclic(second.getSequenceAsString(), gappedCpShift)); 146 } catch (CompoundNotFoundException e) { 147 // this can't happen, the original sequence comes from a ProteinSequence 148 logger.error("Unexpected error while creating protein sequence: {}. This is most likely a bug.",e.getMessage() ); 149 } 150 151 ResidueNumber[] residues = StructureSequenceMatcher.matchSequenceToStructure(first, structure); 152 ResidueNumber[] antipermutedResidues = StructureSequenceMatcher.matchSequenceToStructure(antipermuted, structure); 153 154 ResidueNumber[] nonpermutedResidues = new ResidueNumber[antipermutedResidues.length]; 155 SequenceTools.permuteCyclic(antipermutedResidues, nonpermutedResidues, -gappedCpShift); 156 157 // nullify ResidueNumbers that have a lowercase sequence character 158 if (first.getUserCollection() != null) { 159 CasePreservingProteinSequenceCreator.setLowercaseToNull(first, residues); 160 } 161 if (second.getUserCollection() != null) { 162 CasePreservingProteinSequenceCreator.setLowercaseToNull(second, nonpermutedResidues); 163 } 164 165// for (int i = 0; i < residues.length; i++) { 166// if (residues[i] == null) { 167// System.out.print("="); 168// } else { 169// System.out.print(sequence.getSequenceAsString().charAt(i)); 170// } 171// } 172// System.out.println(); 173// for (int i = 0; i < residues.length; i++) { 174// if (nonpermutedResidues[i] == null) { 175// System.out.print("="); 176// } else { 177// System.out.print(second.getSequenceAsString().charAt(i)); 178// } 179// } 180// System.out.println(); 181 182 return buildAlignment(ca1, ca2, residues, nonpermutedResidues); 183 184 } 185 186 /** 187 * Reads the file {@code fastaFile}, expecting exactly two sequences which give a pairwise alignment. Uses this and two structures to create an AFPChain corresponding to the alignment. Uses a 188 * {@link CasePreservingProteinSequenceCreator} and assumes that a residue is aligned if and only if it is given by an uppercase letter. 189 * 190 * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure) 191 * @throws IOException 192 * @throws StructureException 193 */ 194 public static AFPChain fastaFileToAfpChain(File fastaFile, Structure structure1, Structure structure2) 195 throws IOException, StructureException { 196 InputStream inStream = new FileInputStream(fastaFile); 197 SequenceCreatorInterface<AminoAcidCompound> creator = new CasePreservingProteinSequenceCreator( 198 AminoAcidCompoundSet.getAminoAcidCompoundSet()); 199 SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser = new GenericFastaHeaderParser<>(); 200 FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<>( 201 inStream, headerParser, creator); 202 Map<String, ProteinSequence> sequences = fastaReader.process(); 203 inStream.close(); 204 return fastaToAfpChain(sequences, structure1, structure2); 205 } 206 207 /** 208 * Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code sequence2}. The 209 * sequences need not correspond to the entire structures, since local alignment is performed to match the sequences to structures. 210 * @throws StructureException 211 * @throws CompoundNotFoundException 212 */ 213 public static AFPChain fastaStringToAfpChain(String sequence1, String sequence2, Structure structure1, 214 Structure structure2) throws StructureException, CompoundNotFoundException { 215 ProteinSequence seq1 = new ProteinSequence(sequence1); 216 ProteinSequence seq2 = new ProteinSequence(sequence2); 217 return fastaToAfpChain(seq1, seq2, structure1, structure2); 218 } 219 220 /** 221 * Uses two sequences each with a corresponding structure to create an AFPChain corresponding to the alignment. Provided only for convenience since FastaReaders return such maps. 222 * 223 * @param sequences 224 * A Map containing exactly two entries from sequence names as Strings to gapped ProteinSequences; the name is ignored 225 * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure) 226 * @throws StructureException 227 */ 228 public static AFPChain fastaToAfpChain(Map<String, ProteinSequence> sequences, Structure structure1, 229 Structure structure2) throws StructureException { 230 231 if (sequences.size() != 2) { 232 throw new IllegalArgumentException("There must be exactly 2 sequences, but there were " + sequences.size()); 233 } 234 235 if (structure1 == null || structure2 == null) { 236 throw new IllegalArgumentException("A structure is null"); 237 } 238 239 List<ProteinSequence> seqs = new ArrayList<>(); 240 List<String> names = new ArrayList<>(2); 241 for (Map.Entry<String, ProteinSequence> entry : sequences.entrySet()) { 242 seqs.add(entry.getValue()); 243 names.add(entry.getKey()); 244 } 245 246 return fastaToAfpChain(seqs.get(0), seqs.get(1), structure1, structure2); 247 } 248 249 /** 250 * TODO Write comment 251 * @param sequence1 252 * @param sequence2 253 * @param structure1 254 * @param structure2 255 * @return 256 * @throws StructureException 257 * @throws CompoundNotFoundException 258 */ 259 public static AFPChain fastaToAfpChain(String sequence1, String sequence2, Structure structure1, 260 Structure structure2) throws StructureException, CompoundNotFoundException { 261 ProteinSequence s1 = new ProteinSequence(sequence1); 262 s1.setUserCollection(getAlignedUserCollection(sequence1)); 263 ProteinSequence s2 = new ProteinSequence(sequence2); 264 s2.setUserCollection(getAlignedUserCollection(sequence2)); 265 return fastaToAfpChain(s1, s2, structure1, structure2); 266 } 267 268 /** 269 * Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code sequence2}. The 270 * sequences need not correspond to the entire structures, since local alignment is performed to match the sequences to structures. Assumes that a residue is aligned if and only if it is given by 271 * an uppercase letter. 272 * @param sequence1 <em>Must</em> have {@link ProteinSequence#getUserCollection()} set to document upper- and lower-case as aligned and unaligned; see {@link #getAlignedUserCollection(String)} 273 * @throws StructureException 274 */ 275 public static AFPChain fastaToAfpChain(ProteinSequence sequence1, ProteinSequence sequence2, Structure structure1, 276 Structure structure2) throws StructureException { 277 278 if (structure1 == null || structure2 == null) { 279 throw new IllegalArgumentException("A structure is null"); 280 } 281 282 if (sequence1 == null || sequence2 == null) { 283 throw new IllegalArgumentException("A sequence is null"); 284 } 285 286 Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure1); 287 Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure2); 288 289 ResidueNumber[] residues1 = StructureSequenceMatcher.matchSequenceToStructure(sequence1, structure1); 290 ResidueNumber[] residues2 = StructureSequenceMatcher.matchSequenceToStructure(sequence2, structure2); 291 292 // nullify ResidueNumbers that have a lowercase sequence character 293 if (sequence1.getUserCollection() != null) { 294 CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence1, residues1); 295 } 296 if (sequence2.getUserCollection() != null) { 297 CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence2, residues2); 298 } 299 300 return buildAlignment(ca1, ca2, residues1, residues2); 301 302 } 303 304 /** 305 * Provided only for convenience. 306 * 307 * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure) 308 * @throws StructureException 309 */ 310 public static AFPChain fastaToAfpChain(SequencePair<Sequence<AminoAcidCompound>, AminoAcidCompound> alignment, 311 Structure structure1, Structure structure2) throws StructureException { 312 List<AlignedSequence<Sequence<AminoAcidCompound>, AminoAcidCompound>> seqs = alignment.getAlignedSequences(); 313 StringBuilder sb1 = new StringBuilder(); 314 for (AminoAcidCompound a : seqs.get(0)) { 315 sb1.append(a.getBase()); 316 } 317 try { 318 ProteinSequence seq1 = new ProteinSequence(sb1.toString()); 319 StringBuilder sb2 = new StringBuilder(); 320 for (AminoAcidCompound a : seqs.get(1)) { 321 sb1.append(a.getBase()); 322 } 323 ProteinSequence seq2 = new ProteinSequence(sb2.toString()); 324 LinkedHashMap<String, ProteinSequence> map = new LinkedHashMap<>(); 325 map.put(structure1.getName(), seq1); 326 map.put(structure2.getName(), seq2); 327 return fastaToAfpChain(map, structure1, structure2); 328 } catch (CompoundNotFoundException e) { 329 logger.error("Unexpected error while creating protein sequences: {}. This is most likely a bug.",e.getMessage()); 330 return null; 331 } 332 } 333 334 /** 335 * Builds an {@link AFPChain} from already-matched arrays of atoms and residues. 336 * 337 * @param ca1 338 * An array of atoms in the first structure 339 * @param ca2 340 * An array of atoms in the second structure 341 * @param residues1 342 * An array of {@link ResidueNumber ResidueNumbers} in the first structure that are aligned. Only null ResidueNumbers are considered to be unaligned 343 * @param residues2 344 * An array of {@link ResidueNumber ResidueNumbers} in the second structure that are aligned. Only null ResidueNumbers are considered to be unaligned 345 * @throws StructureException 346 */ 347 private static AFPChain buildAlignment(Atom[] ca1, Atom[] ca2, ResidueNumber[] residues1, ResidueNumber[] residues2) 348 throws StructureException { 349 350 // remove any gap 351 // this includes the ones introduced by the nullifying above 352 List<ResidueNumber> alignedResiduesList1 = new ArrayList<>(); 353 List<ResidueNumber> alignedResiduesList2 = new ArrayList<>(); 354 for (int i = 0; i < residues1.length; i++) { 355 if (residues1[i] != null && residues2[i] != null) { 356 alignedResiduesList1.add(residues1[i]); 357 alignedResiduesList2.add(residues2[i]); 358 } 359 } 360 361 ResidueNumber[] alignedResidues1 = alignedResiduesList1.toArray(new ResidueNumber[alignedResiduesList1.size()]); 362 ResidueNumber[] alignedResidues2 = alignedResiduesList2.toArray(new ResidueNumber[alignedResiduesList2.size()]); 363 364 AFPChain afpChain = AlignmentTools.createAFPChain(ca1, ca2, alignedResidues1, alignedResidues2); 365 afpChain.setAlgorithmName("unknown"); 366 367 AlignmentTools.updateSuperposition(afpChain, ca1, ca2); 368 369 afpChain.setBlockSize(new int[] {afpChain.getNrEQR()}); 370 afpChain.setBlockRmsd(new double[] {afpChain.getTotalRmsdOpt()}); 371 afpChain.setBlockGap(new int[] {afpChain.getGapLen()}); 372 373 return afpChain; 374 375 } 376 377 /** 378 * Takes a protein sequence string with capital and lowercase letters and sets its {@link ProteinSequence#getUserCollection() user collection} to record which letters are uppercase (aligned) and which are lowercase (unaligned). 379 * @param sequence Make sure <em>not</em> to use {@link ProteinSequence#getSequenceAsString()} for this, as it won't preserve upper- and lower-case 380 */ 381 public static List<Object> getAlignedUserCollection(String sequence) { 382 List<Object> aligned = new ArrayList<>(sequence.length()); 383 for (char c : sequence.toCharArray()) { 384 aligned.add(Character.isUpperCase(c)); 385 } 386 return aligned; 387 } 388 389 /** 390 * Prints out the XML representation of an AFPChain from a file containing exactly two FASTA sequences. 391 * 392 * @param args 393 * A String array of fasta-file structure-1-name structure-2-name 394 * @throws StructureException 395 * @throws IOException 396 */ 397 public static void main(String[] args) throws StructureException, IOException { 398 if (args.length != 3) { 399 System.err.println("Usage: FastaAFPChainConverter fasta-file structure-1-name structure-2-name"); 400 return; 401 } 402 File fasta = new File(args[0]); 403 Structure structure1 = StructureTools.getStructure(args[1]); 404 Structure structure2 = StructureTools.getStructure(args[2]); 405 if (structure1 == null) throw new IllegalArgumentException("No structure for " + args[1] + " was found"); 406 if (structure2 == null) throw new IllegalArgumentException("No structure for " + args[2] + " was found"); 407 AFPChain afpChain = fastaFileToAfpChain(fasta, structure1, structure2); 408 String xml = AFPChainXMLConverter.toXML(afpChain); 409 System.out.println(xml); 410 } 411 412}