001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.cluster;
022
023import org.biojava.nbio.structure.*;
024import org.slf4j.Logger;
025import org.slf4j.LoggerFactory;
026
027import java.util.ArrayList;
028import java.util.Collections;
029import java.util.List;
030
031/**
032 * The SubunitExtractor extracts the information of each protein {@link Chain}
033 * in a {@link Structure} and converts them into a List of {@link Subunit}.
034 *
035 * @author Peter Rose
036 * @author Aleix Lafita
037 * @since 5.0.0
038 *
039 */
040public class SubunitExtractor {
041
042        private static final Logger logger = LoggerFactory
043                        .getLogger(SubunitExtractor.class);
044
045        /** Prevent instantiation **/
046        private SubunitExtractor() {
047        }
048
049        /**
050         * Extract the information of each protein Chain in a Structure and converts
051         * them into a List of Subunit. The name of the Subunits is set to
052         * {@link Chain#getId()}.
053         *
054         *
055         * @param structure
056         *            Structure object with protein Chains
057         * @param absMinLen
058         *            {@link SubunitClustererParameters#getAbsoluteMinimumSequenceLength()}
059         * @param fraction
060         *            {@link SubunitClustererParameters#getMinimumSequenceLengthFraction()}
061         * @param minLen
062         *            {@link SubunitClustererParameters#getMinimumSequenceLength()}
063         * @return List of Subunits
064         */
065        public static List<Subunit> extractSubunits(Structure structure,
066                        int absMinLen, double fraction, int minLen) {
067
068                // The extracted subunit container
069                List<Subunit> subunits = new ArrayList<Subunit>();
070
071                for (Chain c : structure.getPolyChains()) {
072                        // Only take protein chains
073                        if (c.isProtein()) {
074                                Atom[] ca = StructureTools.getRepresentativeAtomArray(c);
075                                logger.debug("Chain " + c.getId() + "; CA Atoms: " + ca.length + "; SEQRES: " + c.getSeqResSequence());
076                                if (ca.length==0)
077                                        continue;
078                                subunits.add(new Subunit(ca, c.getId(), null, structure));
079                        }
080                }
081
082                // Calculate the minimum length of a Subunit
083                int adjustedMinLen = calcAdjustedMinimumSequenceLength(subunits,
084                                absMinLen, fraction, minLen);
085                logger.debug("Adjusted minimum sequence length: " + adjustedMinLen);
086
087                // Filter out short Subunits
088                for (int s = subunits.size() - 1; s >= 0; s--) {
089                        if (subunits.get(s).size() < adjustedMinLen)
090                                subunits.remove(s);
091                }
092
093                return subunits;
094        }
095
096        /**
097         * Returns an adapted minimum sequence length. This method ensure that
098         * structure that only have short chains are not excluded by the
099         * minimumSequenceLength cutoff value.
100         *
101         * @return adjustedMinimumSequenceLength
102         */
103        private static int calcAdjustedMinimumSequenceLength(
104                        List<Subunit> subunits, int absMinLen, double fraction, int minLen) {
105
106                int maxLength = Integer.MIN_VALUE;
107                int minLength = Integer.MAX_VALUE;
108
109                // Extract the length List, the min and the max
110                List<Integer> lengths = new ArrayList<Integer>();
111                for (int i = 0; i < subunits.size(); i++) {
112                        if (subunits.get(i).size() >= absMinLen) {
113                                maxLength = Math.max(subunits.get(i).size(), maxLength);
114                                minLength = Math.min(subunits.get(i).size(), minLength);
115                                lengths.add(subunits.get(i).size());
116
117                        }
118                }
119
120                int adjustedMinimumSequenceLength = minLen;
121
122                if (lengths.size() < 2)
123                        return adjustedMinimumSequenceLength;
124
125                // Calculate the median of the lengths
126                double median = 0;
127                Collections.sort(lengths);
128                if (lengths.size() % 2 == 1) {
129                        int middle = (lengths.size() - 1) / 2;
130                        median = lengths.get(middle);
131                } else {
132                        int middle2 = lengths.size() / 2;
133                        int middle1 = middle2 - 1;
134                        median = 0.5 * (lengths.get(middle1) + lengths.get(middle2));
135                }
136
137                // If the median * fraction is lower than the minLength
138                if (minLength >= median * fraction) {
139                        adjustedMinimumSequenceLength = Math.min(minLength, minLen);
140                }
141
142                return adjustedMinimumSequenceLength;
143        }
144}