001/**
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 2013-05-28
021 * Created by Douglas Myers-Turnbull
022 *
023 * @since 3.0.6
024 */
025package org.biojava.nbio.structure.io;
026
027import org.biojava.nbio.core.alignment.template.AlignedSequence;
028import org.biojava.nbio.core.alignment.template.SequencePair;
029import org.biojava.nbio.structure.*;
030import org.biojava.nbio.structure.align.model.AFPChain;
031import org.biojava.nbio.structure.align.util.AlignmentTools;
032import org.biojava.nbio.structure.align.xml.AFPChainXMLConverter;
033import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
034import org.biojava.nbio.core.sequence.ProteinSequence;
035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
036import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
037import org.biojava.nbio.core.sequence.io.CasePreservingProteinSequenceCreator;
038import org.biojava.nbio.core.sequence.io.FastaReader;
039import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser;
040import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
041import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
042import org.biojava.nbio.core.sequence.template.Sequence;
043import org.biojava.nbio.core.util.SequenceTools;
044import org.slf4j.Logger;
045import org.slf4j.LoggerFactory;
046
047import java.io.File;
048import java.io.FileInputStream;
049import java.io.IOException;
050import java.io.InputStream;
051import java.util.*;
052
053/**
054 * A collection of static utilities to convert between {@link AFPChain AFPChains} and FastaSequences.
055 *
056 * @author dmyersturnbull
057 * @see StructureSequenceMatcher
058 * @see FastaStructureParser
059 * @see SeqRes2AtomAligner
060 */
061public class FastaAFPChainConverter {
062
063        private final static Logger logger = LoggerFactory.getLogger(FastaAFPChainConverter.class);
064
065
066        public static AFPChain cpFastaToAfpChain(String first, String second, Structure structure, int cpSite) throws StructureException, CompoundNotFoundException {
067                ProteinSequence s1 = new ProteinSequence(first);
068                s1.setUserCollection(getAlignedUserCollection(first));
069                ProteinSequence s2 = new ProteinSequence(second);
070                s2.setUserCollection(getAlignedUserCollection(second));
071                return cpFastaToAfpChain(s1, s2, structure, cpSite);
072        }
073
074        /**
075         * Takes a structure and sequence corresponding to an alignment between a structure or sequence and itself (or even a structure with a sequence), where the result has a circular permutation site
076         * <code>cpSite</code> residues to the right.
077         *
078         * @param fastaFile A FASTA file containing exactly 2 sequences, the first unpermuted and the second permuted
079         * @param cpSite
080         *            The number of residues from the beginning of the sequence at which the circular permutation site occurs; can be positive or negative; values greater than the length of the sequence
081         *            are acceptable
082         * @throws IOException
083         * @throws StructureException
084         */
085        public static AFPChain cpFastaToAfpChain(File fastaFile, Structure structure, int cpSite) throws IOException, StructureException {
086                InputStream inStream = new FileInputStream(fastaFile);
087                SequenceCreatorInterface<AminoAcidCompound> creator = new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet());
088                SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser = new GenericFastaHeaderParser<>();
089                FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<>(inStream, headerParser, creator);
090                Map<String, ProteinSequence> sequences = fastaReader.process();
091                inStream.close();
092                Iterator<ProteinSequence> iter = sequences.values().iterator();
093                ProteinSequence first = iter.next();
094                ProteinSequence second = iter.next();
095                return cpFastaToAfpChain(first, second, structure, cpSite);
096        }
097
098        /**
099         * Takes a structure and sequence corresponding to an alignment between a structure or sequence and itself (or even a structure with a sequence), where the result has a circular permutation site
100         * <code>cpSite</code> residues to the right.
101         *
102         * @param first The unpermuted sequence
103         * @param second The sequence permuted by cpSite
104         * @param cpSite
105         *            The number of residues from the beginning of the sequence at which the circular permutation site occurs; can be positive or negative; values greater than the length of the sequence
106         *            are acceptable
107         * @throws StructureException
108         */
109        public static AFPChain cpFastaToAfpChain(ProteinSequence first, ProteinSequence second, Structure structure, int cpSite)
110                        throws StructureException {
111
112                if (structure == null) {
113                        throw new IllegalArgumentException("The structure is null");
114                }
115
116                if (first == null) {
117                        throw new IllegalArgumentException("The sequence is null");
118                }
119
120                // we need to find the ungapped CP site
121                int gappedCpShift = 0;
122                int ungappedCpShift = 0;
123                while (ungappedCpShift < Math.abs(cpSite)) {
124                        char c;
125                        try {
126                                if (cpSite <= 0) {
127                                        c = second.getSequenceAsString().charAt(gappedCpShift);
128                                } else {
129                                        c = second.getSequenceAsString().charAt(first.getLength()-1 - gappedCpShift);
130                                }
131                        } catch (StringIndexOutOfBoundsException e) {
132                                throw new IllegalArgumentException("CP site of " + cpSite + " is wrong");
133                        }
134                        if (c != '-') {
135                                ungappedCpShift++;
136                        }
137                        gappedCpShift++;
138                }
139
140                Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure);
141                Atom[] ca2 =  StructureTools.getRepresentativeAtomArray(structure); // can't use cloneCAArray because it doesn't set parent group.chain.structure
142
143                ProteinSequence antipermuted = null;
144                try {
145                        antipermuted = new ProteinSequence(SequenceTools.permuteCyclic(second.getSequenceAsString(), gappedCpShift));
146                } catch (CompoundNotFoundException e) {
147                        // this can't happen, the original sequence comes from a ProteinSequence
148                        logger.error("Unexpected error while creating protein sequence: {}. This is most likely a bug.",e.getMessage() );
149                }
150
151                ResidueNumber[] residues = StructureSequenceMatcher.matchSequenceToStructure(first, structure);
152                ResidueNumber[] antipermutedResidues = StructureSequenceMatcher.matchSequenceToStructure(antipermuted, structure);
153
154                ResidueNumber[] nonpermutedResidues = new ResidueNumber[antipermutedResidues.length];
155                SequenceTools.permuteCyclic(antipermutedResidues, nonpermutedResidues, -gappedCpShift);
156
157                // nullify ResidueNumbers that have a lowercase sequence character
158                if (first.getUserCollection() != null) {
159                        CasePreservingProteinSequenceCreator.setLowercaseToNull(first, residues);
160                }
161                if (second.getUserCollection() != null) {
162                        CasePreservingProteinSequenceCreator.setLowercaseToNull(second, nonpermutedResidues);
163                }
164
165//              for (int i = 0; i < residues.length; i++) {
166//                      if (residues[i] == null) {
167//                              System.out.print("=");
168//                      } else {
169//                              System.out.print(sequence.getSequenceAsString().charAt(i));
170//                      }
171//              }
172//              System.out.println();
173//              for (int i = 0; i < residues.length; i++) {
174//                      if (nonpermutedResidues[i] == null) {
175//                              System.out.print("=");
176//                      } else {
177//                              System.out.print(second.getSequenceAsString().charAt(i));
178//                      }
179//              }
180//              System.out.println();
181
182                return buildAlignment(ca1, ca2, residues, nonpermutedResidues);
183
184        }
185
186        /**
187         * Reads the file {@code fastaFile}, expecting exactly two sequences which give a pairwise alignment. Uses this and two structures to create an AFPChain corresponding to the alignment. Uses a
188         * {@link CasePreservingProteinSequenceCreator} and assumes that a residue is aligned if and only if it is given by an uppercase letter.
189         *
190         * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure)
191         * @throws IOException
192         * @throws StructureException
193         */
194        public static AFPChain fastaFileToAfpChain(File fastaFile, Structure structure1, Structure structure2)
195                        throws IOException, StructureException {
196                InputStream inStream = new FileInputStream(fastaFile);
197                SequenceCreatorInterface<AminoAcidCompound> creator = new CasePreservingProteinSequenceCreator(
198                                AminoAcidCompoundSet.getAminoAcidCompoundSet());
199                SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser = new GenericFastaHeaderParser<>();
200                FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<>(
201                                inStream, headerParser, creator);
202                Map<String, ProteinSequence> sequences = fastaReader.process();
203                inStream.close();
204                return fastaToAfpChain(sequences, structure1, structure2);
205        }
206
207        /**
208         * Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code sequence2}. The
209         * sequences need not correspond to the entire structures, since local alignment is performed to match the sequences to structures.
210         * @throws StructureException
211         * @throws CompoundNotFoundException
212         */
213        public static AFPChain fastaStringToAfpChain(String sequence1, String sequence2, Structure structure1,
214                        Structure structure2) throws StructureException, CompoundNotFoundException {
215                ProteinSequence seq1 = new ProteinSequence(sequence1);
216                ProteinSequence seq2 = new ProteinSequence(sequence2);
217                return fastaToAfpChain(seq1, seq2, structure1, structure2);
218        }
219
220        /**
221         * Uses two sequences each with a corresponding structure to create an AFPChain corresponding to the alignment. Provided only for convenience since FastaReaders return such maps.
222         *
223         * @param sequences
224         *            A Map containing exactly two entries from sequence names as Strings to gapped ProteinSequences; the name is ignored
225         * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure)
226         * @throws StructureException
227         */
228        public static AFPChain fastaToAfpChain(Map<String, ProteinSequence> sequences, Structure structure1,
229                        Structure structure2) throws StructureException {
230
231                if (sequences.size() != 2) {
232                        throw new IllegalArgumentException("There must be exactly 2 sequences, but there were " + sequences.size());
233                }
234
235                if (structure1 == null || structure2 == null) {
236                        throw new IllegalArgumentException("A structure is null");
237                }
238
239                List<ProteinSequence> seqs = new ArrayList<>();
240                List<String> names = new ArrayList<>(2);
241                for (Map.Entry<String, ProteinSequence> entry : sequences.entrySet()) {
242                        seqs.add(entry.getValue());
243                        names.add(entry.getKey());
244                }
245
246                return fastaToAfpChain(seqs.get(0), seqs.get(1), structure1, structure2);
247        }
248
249        /**
250         * TODO Write comment
251         * @param sequence1
252         * @param sequence2
253         * @param structure1
254         * @param structure2
255         * @return
256         * @throws StructureException
257         * @throws CompoundNotFoundException
258         */
259        public static AFPChain fastaToAfpChain(String sequence1, String sequence2, Structure structure1,
260                        Structure structure2) throws StructureException, CompoundNotFoundException {
261                ProteinSequence s1 = new ProteinSequence(sequence1);
262                s1.setUserCollection(getAlignedUserCollection(sequence1));
263                ProteinSequence s2 = new ProteinSequence(sequence2);
264                s2.setUserCollection(getAlignedUserCollection(sequence2));
265                return fastaToAfpChain(s1, s2, structure1, structure2);
266        }
267
268        /**
269         * Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code sequence2}. The
270         * sequences need not correspond to the entire structures, since local alignment is performed to match the sequences to structures. Assumes that a residue is aligned if and only if it is given by
271         * an uppercase letter.
272         * @param sequence1 <em>Must</em> have {@link ProteinSequence#getUserCollection()} set to document upper- and lower-case as aligned and unaligned; see {@link #getAlignedUserCollection(String)}
273         * @throws StructureException
274         */
275        public static AFPChain fastaToAfpChain(ProteinSequence sequence1, ProteinSequence sequence2, Structure structure1,
276                        Structure structure2) throws StructureException {
277
278                if (structure1 == null || structure2 == null) {
279                        throw new IllegalArgumentException("A structure is null");
280                }
281
282                if (sequence1 == null || sequence2 == null) {
283                        throw new IllegalArgumentException("A sequence is null");
284                }
285
286                Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure1);
287                Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure2);
288
289                ResidueNumber[] residues1 = StructureSequenceMatcher.matchSequenceToStructure(sequence1, structure1);
290                ResidueNumber[] residues2 = StructureSequenceMatcher.matchSequenceToStructure(sequence2, structure2);
291
292                // nullify ResidueNumbers that have a lowercase sequence character
293                if (sequence1.getUserCollection() != null) {
294                        CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence1, residues1);
295                }
296                if (sequence2.getUserCollection() != null) {
297                        CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence2, residues2);
298                }
299
300                return buildAlignment(ca1, ca2, residues1, residues2);
301
302        }
303
304        /**
305         * Provided only for convenience.
306         *
307         * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure)
308         * @throws StructureException
309         */
310        public static AFPChain fastaToAfpChain(SequencePair<Sequence<AminoAcidCompound>, AminoAcidCompound> alignment,
311                        Structure structure1, Structure structure2) throws StructureException {
312                List<AlignedSequence<Sequence<AminoAcidCompound>, AminoAcidCompound>> seqs = alignment.getAlignedSequences();
313                StringBuilder sb1 = new StringBuilder();
314                for (AminoAcidCompound a : seqs.get(0)) {
315                        sb1.append(a.getBase());
316                }
317                try {
318                        ProteinSequence seq1 = new ProteinSequence(sb1.toString());
319                        StringBuilder sb2 = new StringBuilder();
320                        for (AminoAcidCompound a : seqs.get(1)) {
321                                sb1.append(a.getBase());
322                        }
323                        ProteinSequence seq2 = new ProteinSequence(sb2.toString());
324                        LinkedHashMap<String, ProteinSequence> map = new LinkedHashMap<>();
325                        map.put(structure1.getName(), seq1);
326                        map.put(structure2.getName(), seq2);
327                        return fastaToAfpChain(map, structure1, structure2);
328                } catch (CompoundNotFoundException e) {
329                        logger.error("Unexpected error while creating protein sequences: {}. This is most likely a bug.",e.getMessage());
330                        return null;
331                }
332        }
333
334        /**
335         * Builds an {@link AFPChain} from already-matched arrays of atoms and residues.
336         *
337         * @param ca1
338         *            An array of atoms in the first structure
339         * @param ca2
340         *            An array of atoms in the second structure
341         * @param residues1
342         *            An array of {@link ResidueNumber ResidueNumbers} in the first structure that are aligned. Only null ResidueNumbers are considered to be unaligned
343         * @param residues2
344         *            An array of {@link ResidueNumber ResidueNumbers} in the second structure that are aligned. Only null ResidueNumbers are considered to be unaligned
345         * @throws StructureException
346         */
347        private static AFPChain buildAlignment(Atom[] ca1, Atom[] ca2, ResidueNumber[] residues1, ResidueNumber[] residues2)
348                        throws StructureException {
349
350                // remove any gap
351                // this includes the ones introduced by the nullifying above
352                List<ResidueNumber> alignedResiduesList1 = new ArrayList<>();
353                List<ResidueNumber> alignedResiduesList2 = new ArrayList<>();
354                for (int i = 0; i < residues1.length; i++) {
355                        if (residues1[i] != null && residues2[i] != null) {
356                                alignedResiduesList1.add(residues1[i]);
357                                alignedResiduesList2.add(residues2[i]);
358                        }
359                }
360
361                ResidueNumber[] alignedResidues1 = alignedResiduesList1.toArray(new ResidueNumber[alignedResiduesList1.size()]);
362                ResidueNumber[] alignedResidues2 = alignedResiduesList2.toArray(new ResidueNumber[alignedResiduesList2.size()]);
363
364                AFPChain afpChain = AlignmentTools.createAFPChain(ca1, ca2, alignedResidues1, alignedResidues2);
365                afpChain.setAlgorithmName("unknown");
366
367                AlignmentTools.updateSuperposition(afpChain, ca1, ca2);
368
369                afpChain.setBlockSize(new int[] {afpChain.getNrEQR()});
370                afpChain.setBlockRmsd(new double[] {afpChain.getTotalRmsdOpt()});
371                afpChain.setBlockGap(new int[] {afpChain.getGapLen()});
372
373                return afpChain;
374
375        }
376
377        /**
378         * Takes a protein sequence string with capital and lowercase letters and sets its {@link ProteinSequence#getUserCollection() user collection} to record which letters are uppercase (aligned) and which are lowercase (unaligned).
379         * @param sequence Make sure <em>not</em> to use {@link ProteinSequence#getSequenceAsString()} for this, as it won't preserve upper- and lower-case
380         */
381        public static List<Object> getAlignedUserCollection(String sequence) {
382                List<Object> aligned = new ArrayList<>(sequence.length());
383                for (char c : sequence.toCharArray()) {
384                        aligned.add(Character.isUpperCase(c));
385                }
386                return aligned;
387        }
388
389        /**
390         * Prints out the XML representation of an AFPChain from a file containing exactly two FASTA sequences.
391         *
392         * @param args
393         *            A String array of fasta-file structure-1-name structure-2-name
394         * @throws StructureException
395         * @throws IOException
396         */
397        public static void main(String[] args) throws StructureException, IOException {
398                if (args.length != 3) {
399                        System.err.println("Usage: FastaAFPChainConverter fasta-file structure-1-name structure-2-name");
400                        return;
401                }
402                File fasta = new File(args[0]);
403                Structure structure1 = StructureTools.getStructure(args[1]);
404                Structure structure2 = StructureTools.getStructure(args[2]);
405                if (structure1 == null) throw new IllegalArgumentException("No structure for " + args[1] + " was found");
406                if (structure2 == null) throw new IllegalArgumentException("No structure for " + args[2] + " was found");
407                AFPChain afpChain = fastaFileToAfpChain(fasta, structure1, structure2);
408                String xml = AFPChainXMLConverter.toXML(afpChain);
409                System.out.println(xml);
410        }
411
412}