001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on June 9, 2010 021 * Author: Mark Chapman 022 */ 023 024package org.biojava.nbio.core.alignment.matrices; 025 026import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 028import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 029import org.biojava.nbio.core.sequence.template.Compound; 030import org.biojava.nbio.core.sequence.template.CompoundSet; 031 032import java.io.*; 033import java.util.*; 034 035/** 036 * Implements a data structure which holds the score (penalty or bonus) given during alignment for the exchange of one 037 * {@link Compound} in a sequence for another. 038 * 039 * @author Mark Chapman 040 * @author Daniel Cameron 041 * @author Paolo Pavan 042 * @param <C> each element of the matrix corresponds to a pair of {@link Compound}s of type C 043 */ 044public class SimpleSubstitutionMatrix<C extends Compound> implements SubstitutionMatrix<C>, Serializable { 045 046 /** 047 * 048 */ 049 private static final long serialVersionUID = -2645265638108462479L; 050 051 private static final String comment = "#"; 052 053 private CompoundSet<C> compoundSet; 054 private String description, name; 055 private short[][] matrix; 056 private short max, min; 057 private List<C> rows, cols; 058 059 public static SubstitutionMatrix<AminoAcidCompound> getBlosum62() { 060 return new SimpleSubstitutionMatrix<>(AminoAcidCompoundSet.getAminoAcidCompoundSet(), new InputStreamReader( 061 SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum62.txt")), "blosum62"); 062 } 063 064 /** 065 * Creates a substitution matrix by reading in a file. 066 * 067 * @param compoundSet the {@link CompoundSet} on which the matrix is defined 068 * @param fileInput file parsed for a substitution matrix 069 * @throws FileNotFoundException if fileInput parameter cannot be read 070 */ 071 public SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, File fileInput) throws FileNotFoundException { 072 this(compoundSet, new BufferedReader(new FileReader(fileInput)), fileInput.getName()); 073 } 074 075 /** 076 * Creates a substitution matrix by parsing some input. 077 * 078 * @param compoundSet the {@link CompoundSet} on which the matrix is defined 079 * @param input input parsed for a substitution matrix 080 * @param name the name (short description) of this matrix 081 */ 082 public SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, Reader input, String name) { 083 this(compoundSet, new Scanner(input), name); 084 } 085 086 /** 087 * Creates a substitution matrix by parsing a String. 088 * 089 * @param compoundSet the {@link CompoundSet} on which the matrix is defined 090 * @param matrixInput String parsed for a substitution matrix 091 * @param name the name (short description) of this matrix 092 */ 093 public SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, String matrixInput, String name) { 094 this(compoundSet, new Scanner(matrixInput), name); 095 } 096 097 /** 098 * Creates an identity substitution matrix from match and replace values. 099 * 100 * @param compoundSet the {@link CompoundSet} on which the matrix is defined 101 * @param match matrix value used for equivalent {@link Compound}s 102 * @param replace matrix value used for differing {@link Compound}s 103 */ 104 public SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, short match, short replace) { 105 this.compoundSet = compoundSet; 106 description = "Identity matrix. All replaces and all matches are treated equally."; 107 name = "IDENTITY_" + match + "_" + replace; 108 max = (match > replace) ? match : replace; 109 min = (match < replace) ? match : replace; 110 rows = cols = compoundSet.getAllCompounds(); 111 matrix = new short[rows.size()][cols.size()]; 112 for (int r = 0; r < rows.size(); r++) { 113 for (int c = 0; c < cols.size(); c++) { 114 try { 115 matrix[r][c] = (compoundSet.compoundsEquivalent(rows.get(r), cols.get(c))) ? match : replace; 116 } catch (UnsupportedOperationException e) { 117 matrix[r][c] = (r == c) ? match : replace; 118 } 119 } 120 } 121 } 122 123 // helper constructor that creates a substitution matrix by parsing input 124 private SimpleSubstitutionMatrix(CompoundSet<C> compoundSet, Scanner input, String name) { 125 this.compoundSet = compoundSet; 126 this.name = name; 127 max = Short.MIN_VALUE; 128 min = Short.MAX_VALUE; 129 rows = new ArrayList<>(); 130 cols = new ArrayList<>(); 131 StringBuilder descriptionIn = new StringBuilder(); 132 List<short[]> matrixIn = new ArrayList<>(); 133 while(input.hasNextLine()) { 134 String line = input.nextLine(); 135 if (line.startsWith(comment)) { 136 descriptionIn.append(String.format("%s%n", line)); 137 } else if (!line.trim().isEmpty()) { 138 StringTokenizer st = new StringTokenizer(line); 139 if (cols.isEmpty()) { 140 while (st.hasMoreTokens()) { 141 cols.add(compoundSet.getCompoundForString(st.nextToken())); 142 } 143 } else { 144 rows.add(compoundSet.getCompoundForString(st.nextToken())); 145 short[] row = new short[cols.size()]; 146 for (int i = 0; i < row.length && st.hasMoreTokens(); i++) { 147 row[i] = Short.parseShort(st.nextToken()); 148 max = (max > row[i]) ? max : row[i]; 149 min = (min < row[i]) ? min : row[i]; 150 } 151 matrixIn.add(row); 152 } 153 } 154 } 155 input.close(); 156 description = descriptionIn.toString(); 157 matrix = new short[rows.size()][cols.size()]; 158 for (int i = 0; i < rows.size(); i++) { 159 matrix[i] = matrixIn.get(i); 160 } 161 } 162 163 @Override 164 public CompoundSet<C> getCompoundSet() { 165 return compoundSet; 166 } 167 168 @Override 169 public String getDescription() { 170 return description; 171 } 172 173 @Override 174 public short[][] getMatrix() { 175 short[][] copy = new short[matrix.length][matrix[0].length]; 176 for (int i = 0; i < copy.length; i++) { 177 copy[i] = Arrays.copyOf(matrix[i], matrix[i].length); 178 } 179 return copy; 180 } 181 182 @Override 183 public String getMatrixAsString() { 184 StringBuilder s = new StringBuilder(); 185 int lengthCompound = compoundSet.getMaxSingleCompoundStringLength(), lengthRest = 186 Math.max(Math.max(Short.toString(min).length(), Short.toString(max).length()), lengthCompound) + 1; 187 String padCompound = "%" + Integer.toString(lengthCompound) + "s", 188 padRest = "%" + Integer.toString(lengthRest); 189 for (int i = 0; i < lengthCompound; i++) { 190 s.append(" "); 191 } 192 for (C col : cols) { 193 s.append(String.format(padRest + "s", compoundSet.getStringForCompound(col))); 194 } 195 s.append(String.format("%n")); 196 for (C row : rows) { 197 s.append(String.format(padCompound, compoundSet.getStringForCompound(row))); 198 for (C col : cols) { 199 s.append(String.format(padRest + "d", getValue(row, col))); 200 } 201 s.append(String.format("%n")); 202 } 203 return s.toString(); 204 } 205 206 @Override 207 public short getMaxValue() { 208 return max; 209 } 210 211 @Override 212 public short getMinValue() { 213 return min; 214 } 215 216 @Override 217 public String getName() { 218 return name; 219 } 220 /** 221 * Returns the index of the first occurrence of the specified element in the list. 222 * If the list does not contain the given compound, the index of the first occurrence 223 * of the element according to case-insensitive equality. 224 * If no such elements exist, -1 is returned. 225 * @param list list of compounds to search 226 * @param compound compound to search for 227 * @return Returns the index of the first match to the specified element in this list, or -1 if there is no such index. 228 */ 229 private static <C extends Compound> int getIndexOfCompound(List<C> list, C compound) { 230 int index = list.indexOf(compound); 231 if (index == -1) { 232 for (int i = 0; i < list.size(); i++) { 233 if (compound.equalsIgnoreCase(list.get(i))) { 234 index = i; 235 break; 236 } 237 } 238 } 239 return index; 240 } 241 @Override 242 public short getValue(C from, C to) { 243 int row = getIndexOfCompound(rows, from), col = getIndexOfCompound(cols, to); 244 if (row == -1 || col == -1) { 245 row = getIndexOfCompound(cols, from); 246 col = getIndexOfCompound(rows, to); 247 if (row == -1 || col == -1) { 248 return min; 249 } 250 } 251 return matrix[row][col]; 252 } 253 254 @Override 255 public SubstitutionMatrix<C> normalizeMatrix(short scale) { 256 // TODO SubstitutionMatrix<C> normalizeMatrix(short) 257 return null; 258 } 259 260 @Override 261 public void setDescription(String description) { 262 this.description = description; 263 } 264 265 @Override 266 public void setName(String name) { 267 this.name = name; 268 } 269 270 /** 271 * Returns in a format similar to the standard NCBI files. 272 */ 273 @Override 274 public String toString() { 275 StringBuilder s = new StringBuilder(); 276 StringTokenizer st = new StringTokenizer(description, "\n\r"); 277 while (st.hasMoreTokens()) { 278 String line = st.nextToken(); 279 if (!line.startsWith(comment)) { 280 s.append(comment); 281 } 282 s.append(String.format("%s%n", line)); 283 } 284 s.append(getMatrixAsString()); 285 return s.toString(); 286 } 287 288 @Override 289 public Map<C, Short> getRow(C row) { 290 int rowIndex = rows.indexOf(row); 291 Map<C, Short> map = new HashMap<>(); 292 for (int colIndex = 0; colIndex < matrix[rowIndex].length; colIndex++) { 293 map.put(cols.get(colIndex), matrix[rowIndex][colIndex]); 294 } 295 return map; 296 } 297 298 @Override 299 public Map<C, Short> getColumn(C column) { 300 int colIndex = cols.indexOf(column); 301 Map<C, Short> map = new HashMap<>(); 302 for (int i = 0; i < matrix.length; i++) { 303 map.put(rows.get(i), matrix[i][colIndex]); 304 } 305 return map; 306 } 307 308}