001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on June 9, 2010
021 * Author: Mark Chapman
022 */
023
024package org.biojava.nbio.core.alignment.matrices;
025
026import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
029import org.biojava.nbio.core.sequence.template.Compound;
030import org.biojava.nbio.core.sequence.template.CompoundSet;
031
032import java.io.*;
033import java.util.*;
034
035/**
036 * Implements a data structure which holds the score (penalty or bonus) given during alignment for the exchange of one
037 * {@link Compound} in a sequence for another.
038 *
039 * @author Mark Chapman
040 * @author Daniel Cameron
041 * @author Paolo Pavan
042 * @param <C> each element of the matrix corresponds to a pair of {@link Compound}s of type C
043 */
044public class SimpleSubstitutionMatrix<C extends Compound> implements SubstitutionMatrix<C>, Serializable {
045
046        /**
047         *
048         */
049        private static final long serialVersionUID = -2645265638108462479L;
050
051        private static final String comment = "#";
052
053        private CompoundSet<C> compoundSet;
054        private String description, name;
055        private short[][] matrix;
056        private short max, min;
057        private List<C> rows, cols;
058
059        public static SubstitutionMatrix<AminoAcidCompound> getBlosum62() {
060                return new SimpleSubstitutionMatrix<>(AminoAcidCompoundSet.getAminoAcidCompoundSet(), new InputStreamReader(
061                                SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum62.txt")), "blosum62");
062        }
063
064        /**
065         * Creates a substitution matrix by reading in a file.
066         *
067         * @param compoundSet the {@link CompoundSet} on which the matrix is defined
068         * @param fileInput file parsed for a substitution matrix
069         * @throws FileNotFoundException if fileInput parameter cannot be read
070         */
071        public SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, File fileInput) throws FileNotFoundException {
072                this(compoundSet, new BufferedReader(new FileReader(fileInput)), fileInput.getName());
073        }
074
075        /**
076         * Creates a substitution matrix by parsing some input.
077         *
078         * @param compoundSet the {@link CompoundSet} on which the matrix is defined
079         * @param input input parsed for a substitution matrix
080         * @param name the name (short description) of this matrix
081         */
082        public SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, Reader input, String name) {
083                this(compoundSet, new Scanner(input), name);
084        }
085
086        /**
087         * Creates a substitution matrix by parsing a String.
088         *
089         * @param compoundSet the {@link CompoundSet} on which the matrix is defined
090         * @param matrixInput String parsed for a substitution matrix
091         * @param name the name (short description) of this matrix
092         */
093        public SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, String matrixInput, String name) {
094                this(compoundSet, new Scanner(matrixInput), name);
095        }
096
097        /**
098         * Creates an identity substitution matrix from match and replace values.
099         *
100         * @param compoundSet the {@link CompoundSet} on which the matrix is defined
101         * @param match matrix value used for equivalent {@link Compound}s
102         * @param replace matrix value used for differing {@link Compound}s
103         */
104        public SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, short match, short replace) {
105                this.compoundSet = compoundSet;
106                description = "Identity matrix. All replaces and all matches are treated equally.";
107                name = "IDENTITY_" + match + "_" + replace;
108                max = (match > replace) ? match : replace;
109                min = (match < replace) ? match : replace;
110                rows = cols = compoundSet.getAllCompounds();
111                matrix = new short[rows.size()][cols.size()];
112                for (int r = 0; r < rows.size(); r++) {
113                        for (int c = 0; c < cols.size(); c++) {
114                                try {
115                                        matrix[r][c] = (compoundSet.compoundsEquivalent(rows.get(r), cols.get(c))) ? match : replace;
116                                } catch (UnsupportedOperationException e) {
117                                        matrix[r][c] = (r == c) ? match : replace;
118                                }
119                        }
120                }
121        }
122
123        // helper constructor that creates a substitution matrix by parsing input
124        private SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, Scanner input, String name) {
125                this.compoundSet = compoundSet;
126                this.name = name;
127                max = Short.MIN_VALUE;
128                min = Short.MAX_VALUE;
129                rows = new ArrayList<>();
130                cols = new ArrayList<>();
131                StringBuilder descriptionIn = new StringBuilder();
132                List<short[]> matrixIn = new ArrayList<>();
133                while(input.hasNextLine()) {
134                        String line = input.nextLine();
135                        if (line.startsWith(comment)) {
136                                descriptionIn.append(String.format("%s%n", line));
137                        } else if (!line.trim().isEmpty()) {
138                                StringTokenizer st = new StringTokenizer(line);
139                                if (cols.isEmpty()) {
140                                        while (st.hasMoreTokens()) {
141                                                cols.add(compoundSet.getCompoundForString(st.nextToken()));
142                                        }
143                                } else {
144                                        rows.add(compoundSet.getCompoundForString(st.nextToken()));
145                                        short[] row = new short[cols.size()];
146                                        for (int i = 0; i < row.length && st.hasMoreTokens(); i++) {
147                                                row[i] = Short.parseShort(st.nextToken());
148                                                max = (max > row[i]) ? max : row[i];
149                                                min = (min < row[i]) ? min : row[i];
150                                        }
151                                        matrixIn.add(row);
152                                }
153                        }
154                }
155                input.close();
156                description = descriptionIn.toString();
157                matrix = new short[rows.size()][cols.size()];
158                for (int i = 0; i < rows.size(); i++) {
159                        matrix[i] = matrixIn.get(i);
160                }
161        }
162
163        @Override
164        public CompoundSet<C> getCompoundSet() {
165                return compoundSet;
166        }
167
168        @Override
169        public String getDescription() {
170                return description;
171        }
172
173        @Override
174        public short[][] getMatrix() {
175                short[][] copy = new short[matrix.length][matrix[0].length];
176                for (int i = 0; i < copy.length; i++) {
177                        copy[i] = Arrays.copyOf(matrix[i], matrix[i].length);
178                }
179                return copy;
180        }
181
182        @Override
183        public String getMatrixAsString() {
184                StringBuilder s = new StringBuilder();
185                int lengthCompound = compoundSet.getMaxSingleCompoundStringLength(), lengthRest =
186                                Math.max(Math.max(Short.toString(min).length(), Short.toString(max).length()), lengthCompound) + 1;
187                String padCompound = "%" + Integer.toString(lengthCompound) + "s",
188                                padRest = "%" + Integer.toString(lengthRest);
189                for (int i = 0; i < lengthCompound; i++) {
190                        s.append(" ");
191                }
192                for (C col : cols) {
193                        s.append(String.format(padRest + "s", compoundSet.getStringForCompound(col)));
194                }
195                s.append(String.format("%n"));
196                for (C row : rows) {
197                        s.append(String.format(padCompound, compoundSet.getStringForCompound(row)));
198                        for (C col : cols) {
199                                s.append(String.format(padRest + "d", getValue(row, col)));
200                        }
201                        s.append(String.format("%n"));
202                }
203                return s.toString();
204        }
205
206        @Override
207        public short getMaxValue() {
208                return max;
209        }
210
211        @Override
212        public short getMinValue() {
213                return min;
214        }
215
216        @Override
217        public String getName() {
218                return name;
219        }
220        /**
221         * Returns the index of the first occurrence of the specified element in the list.
222         * If the list does not contain the given compound, the index of the first occurrence
223         * of the element according to case-insensitive equality.
224         * If no such elements exist, -1 is returned.
225         * @param list list of compounds to search
226         * @param compound compound to search for
227         * @return Returns the index of the first match to the specified element in this list, or -1 if there is no such index.
228         */
229        private static <C extends Compound> int getIndexOfCompound(List<C> list, C compound) {
230                int index = list.indexOf(compound);
231                if (index == -1) {
232                        for (int i = 0; i < list.size(); i++) {
233                                if (compound.equalsIgnoreCase(list.get(i))) {
234                                        index = i;
235                                        break;
236                                }
237                        }
238                }
239                return index;
240        }
241        @Override
242        public short getValue(C from, C to) {
243                int row = getIndexOfCompound(rows, from), col = getIndexOfCompound(cols, to);
244                if (row == -1 || col == -1) {
245                        row = getIndexOfCompound(cols, from);
246                        col = getIndexOfCompound(rows, to);
247                        if (row == -1 || col == -1) {
248                                return min;
249                        }
250                }
251                return matrix[row][col];
252        }
253
254        @Override
255        public SubstitutionMatrix<C> normalizeMatrix(short scale) {
256                // TODO SubstitutionMatrix<C> normalizeMatrix(short)
257                return null;
258        }
259
260        @Override
261        public void setDescription(String description) {
262                this.description = description;
263        }
264
265        @Override
266        public void setName(String name) {
267                this.name = name;
268        }
269
270        /**
271         * Returns in a format similar to the standard NCBI files.
272         */
273        @Override
274        public String toString() {
275                StringBuilder s = new StringBuilder();
276                StringTokenizer st = new StringTokenizer(description, "\n\r");
277                while (st.hasMoreTokens()) {
278                        String line = st.nextToken();
279                        if (!line.startsWith(comment)) {
280                                s.append(comment);
281                        }
282                        s.append(String.format("%s%n", line));
283                }
284                s.append(getMatrixAsString());
285                return s.toString();
286        }
287
288        @Override
289        public Map<C, Short> getRow(C row) {
290                int rowIndex = rows.indexOf(row);
291                Map<C, Short> map = new HashMap<>();
292                for (int colIndex = 0; colIndex < matrix[rowIndex].length; colIndex++) {
293                        map.put(cols.get(colIndex), matrix[rowIndex][colIndex]);
294                }
295                return map;
296        }
297
298        @Override
299        public Map<C, Short> getColumn(C column) {
300                int colIndex = cols.indexOf(column);
301                Map<C, Short> map = new HashMap<>();
302                for (int i = 0; i < matrix.length; i++) {
303                        map.put(rows.get(i), matrix[i][colIndex]);
304                }
305                return map;
306        }
307
308}