/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.alignment.matrices;

import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Line-oriented parser for AAINDEX database files that contain substitution
 * matrices.
 *
 * <p>Each entry is read as a sequence of tagged lines: {@code "H "} starts a new
 * matrix and carries its name, {@code "D "} carries its description,
 * {@code "M "} declares the row and column symbols, lines that begin with a
 * blank are score rows, and {@code "//"} terminates the entry. Completed
 * matrices are collected by name and made available through
 * {@link #getMatrices()}.
 *
 * <p>The parser keeps its working state in instance fields, so a single
 * instance must not be used for overlapping {@code parse} calls.
 */
public class AAIndexFileParser {

    /** Factor applied to every score before rounding; {@code -1} means "not determined yet". */
    int scale = -1;

    /** All matrices parsed so far, keyed by the name taken from the {@code "H "} line. */
    Map<String, SubstitutionMatrix<AminoAcidCompound>> matrices;

    ScaledSubstitutionMatrix currentMatrix;
    String currentRows;
    String currentCols;

    /** Index of the score row most recently read; {@code -1} before the first row. */
    int currentRowPos;
    private short[][] matrix;
    short max;
    short min;
    private List<AminoAcidCompound> rows, cols;
    boolean inMatrix;
    boolean symmetricMatrix;

    public AAIndexFileParser() {
        matrices = new HashMap<>();
    }

    /**
     * Parse an inputStream that points to an AAINDEX database file.
     *
     * <p>The stream is read to its end but is not closed here; closing the
     * reader would also close the stream handed in by the caller.
     *
     * @param inputStream stream positioned at the start of the AAINDEX data
     * @throws IOException if reading from the stream fails
     */
    public void parse(InputStream inputStream) throws IOException {

        currentMatrix = null;
        currentRows = "";
        currentCols = "";
        max = Short.MIN_VALUE;
        min = Short.MAX_VALUE;
        inMatrix = false;

        // Explicit charset so the result does not depend on the platform default.
        BufferedReader buf = new BufferedReader(
                new InputStreamReader(inputStream, StandardCharsets.UTF_8));

        for (String line = buf.readLine(); line != null; line = buf.readLine()) {
            if (line.startsWith("//")) {
                // end of the current entry
                finalizeMatrix();
                inMatrix = false;
            } else if (line.startsWith("H ")) {
                // a new matrix!
                newMatrix(line);
            } else if (line.startsWith("D ")) {
                currentMatrix.setDescription(line.substring(2));
            } else if (line.startsWith("M ")) {
                initMatrix(line);
                inMatrix = true;
            } else if (line.startsWith(" ")) {
                // indented lines are score rows only once the "M " line was seen
                if (inMatrix) {
                    processScores(line);
                }
            }
        }
    }

    /**
     * Process one score row, e.g. {@code >   -0.3   1.6   0.7   0.8  -2.6   3.0<}.
     *
     * <p>Every value is multiplied by {@link #scale}, rounded and stored as a
     * {@code short}. When a row holds fewer values than there are columns the
     * matrix is treated as triangular: from that row on each value is mirrored
     * across the diagonal.
     *
     * @param line the raw score line
     */
    private void processScores(String line) {
        String[] values = line.trim().split(" +");

        // advance to the row this line describes
        currentRowPos++;

        for (int i = 0; i < values.length; i++) {
            values[i] = normalizeValue(values[i]);

            // The scale is derived once per matrix, from the first (already
            // normalized) value of the first score row.
            if (scale == -1) {
                scale = determineScale(values[0]);
            }

            float score = scale * Float.parseFloat(values[i]);
            short s = (short) Math.round(score);
            matrix[currentRowPos][i] = s;

            if (values.length < cols.size() || symmetricMatrix) {
                matrix[i][currentRowPos] = s;
                symmetricMatrix = true;
            }

            // Track extremes using the value that is actually stored.
            if (s > max) {
                max = s;
            }
            if (s < min) {
                min = s;
            }
        }
    }

    /**
     * Bring a raw score token into a form {@link Float#parseFloat(String)} accepts
     * and {@link #determineScale(String)} interprets consistently.
     *
     * @param value the raw token
     * @return the token with a trailing {@code "."} completed to {@code ".0"},
     *         and a lone {@code "-"} replaced by {@code "0"}
     */
    private static String normalizeValue(String value) {
        String normalized = value;
        if (normalized.endsWith(".")) {
            normalized = normalized + "0";
        }
        // special case: MEHP950101
        if ("-".equals(normalized)) {
            normalized = "0";
        }
        return normalized;
    }

    /**
     * Derive the scaling factor from the number of decimal places of a value.
     *
     * @param value a numeric token such as {@code "-0.35"}
     * @return {@code 10^d} where {@code d} is the number of characters after the
     *         decimal point, or {@code 1} if there is no fractional part
     */
    private int determineScale(String value) {
        String[] spl = value.split("\\.");
        if (spl.length <= 1) {
            return 1;
        }
        String digits = spl[1];
        return (int) Math.round(Math.pow(10, digits.length()));
    }

    /**
     * Process a line of type
     * {@code >M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV<}:
     * allocate the score matrix and resolve the row and column compounds.
     *
     * @param line the {@code "M "} line
     */
    private void initMatrix(String line) {
        String[] spl = line.split(" ");

        // trim off the final , character
        currentRows = spl[3].substring(0, spl[3].length() - 1);
        currentCols = spl[6];
        currentRowPos = -1;

        int nrRows = currentRows.length();
        int nrCols = currentCols.length();

        matrix = new short[nrRows][nrCols];

        rows = new ArrayList<>(nrRows);
        cols = new ArrayList<>(nrCols);

        AminoAcidCompoundSet compoundSet = AminoAcidCompoundSet.getAminoAcidCompoundSet();
        for (int i = 0; i < nrRows; i++) {
            char c = currentRows.charAt(i);
            rows.add(compoundSet.getCompoundForString(String.valueOf(c)));
        }

        for (int i = 0; i < nrCols; i++) {
            // Column symbols come from the cols declaration, not the rows one.
            char c = currentCols.charAt(i);
            cols.add(compoundSet.getCompoundForString(String.valueOf(c)));
        }
        currentMatrix.setScale(scale);
    }

    /**
     * Start a new matrix from an {@code "H "} line and reset all per-matrix state.
     *
     * @param line the {@code "H "} line; everything after the two-character tag
     *             becomes the matrix name
     */
    private void newMatrix(String line) {
        symmetricMatrix = false;
        scale = -1;

        // Extremes are per matrix; without this reset a later matrix would
        // inherit the max/min of the ones parsed before it.
        max = Short.MIN_VALUE;
        min = Short.MAX_VALUE;

        currentMatrix = new ScaledSubstitutionMatrix();
        currentMatrix.setName(line.substring(2));
    }

    /**
     * Copy the collected state into the current matrix and register it under
     * its name.
     *
     * @return the completed matrix
     */
    private SubstitutionMatrix<AminoAcidCompound> finalizeMatrix() {

        currentMatrix.setMatrix(matrix);
        currentMatrix.setMax(max);
        currentMatrix.setMin(min);
        currentMatrix.setCols(cols);
        currentMatrix.setRows(rows);
        currentMatrix.setScale(scale);
        matrices.put(currentMatrix.getName(), currentMatrix);

        return currentMatrix;
    }

    /**
     * @return the map of all matrices parsed so far, keyed by matrix name; this
     *         is the parser's own map, not a copy
     */
    public Map<String, SubstitutionMatrix<AminoAcidCompound>> getMatrices() {
        return matrices;
    }
}