001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import java.io.BufferedReader; 025import java.io.FileReader; 026import java.io.OutputStream; 027import java.io.PrintStream; 028import java.util.Iterator; 029import java.util.LinkedHashMap; 030import java.util.Vector; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033 034import org.biojava.bio.BioException; 035import org.biojava.bio.alignment.Alignment; 036import org.biojava.bio.alignment.SimpleAlignment; 037import org.biojava.bio.seq.DNATools; 038import org.biojava.bio.seq.ProteinTools; 039import org.biojava.bio.symbol.FiniteAlphabet; 040import org.biojava.bio.symbol.IllegalSymbolException; 041import org.biojava.bio.symbol.Symbol; 042import org.biojava.bio.symbol.SymbolList; 043 044/** 045 * @author raemig 046 * @author Thomas Down 047 * @author Keith James 048 * @author Nimesh Singh 049 * @author Mark Schreiber 050 * @author Matthew Pocock 051 * @author Bradford Powell 052 */ 053 054public class MSFAlignmentFormat implements AlignmentFormat { 055 private static final boolean DEBUGPRINT = false; 056 private static final int DNA = 1; 057 private static final int PROTEIN = 2; 058 059 public MSFAlignmentFormat() { 060 } 061 062 /** 063 * used to quick test the code 064 * 065 * @param args 066 */ 067 public static void main(String[] args) { 068 String filename; 069 if (args.length < 1) { 070 filename = "SimpleMSF.msf"; // change to your favorite 071 } else { 072 filename = args[0]; 073 } 074 try { 075 BufferedReader reader = new BufferedReader(new FileReader(filename)); 076 MSFAlignmentFormat MSFAlignmentFormat1 = new MSFAlignmentFormat(); 077 MSFAlignmentFormat1.read(reader); 078 } catch (Exception E) { 079 } 080 } 081 082 /** 083 * Reads an MSF Alignment File 084 * 085 * @param reader 086 * The file reader 087 * @return Alignment A SimpleAlignment consisting of the sequences in the 088 * file. 089 */ 090 public Alignment read(BufferedReader reader) { 091 Vector sequenceNames = new Vector(); 092 String sequenceName = null; 093 StringBuffer sequenceData[] = null; 094 int startOfData = 0; // the start of the sequence data in the line 095 int currSeqCount = 0; // which sequence data you are currently trying to 096 // get 097 try { 098 Pattern mtc = Pattern 099 .compile("(Name:|NAME:)\\s+(.*?)\\s+(oo|OO|Len:|LEN:)"); 100 Pattern removewhitespace = Pattern.compile("\\s"); 101 // REMatch rem = null; 102 String line = reader.readLine(); 103 // parse past header 104 while (line.toUpperCase().indexOf("NAME:") == -1) { 105 line = reader.readLine(); 106 } 107 // read each name (between Name: and Len: 108 while ((line.indexOf("//") == -1) && ((line.trim()).length() != 0)) { 109 Matcher matcher = mtc.matcher(line); 110 if (!matcher.find()) { 111 break; 112 } // end of sequence names 113 // sequenceName = line.substring(rem.getSubStartIndex(1), 114 // rem.getSubEndIndex(1)); 115 if ((line.trim()).length() == 0) { 116 break; 117 } 118 sequenceName = matcher.group(2).trim(); 119 sequenceNames.add(sequenceName); 120 121 line = reader.readLine(); 122 } 123 sequenceData = new StringBuffer[sequenceNames.size()]; 124 for (int it = 0; it < sequenceNames.size(); it++) { 125 sequenceData[it] = new StringBuffer(); 126 } 127 // until you get a line that matches the first sequence 128 while (line.indexOf((String) sequenceNames.get(0)) == -1) { 129 line = reader.readLine(); 130 } 131 // now you on the first line of the sequence data 132 while (line != null) { 133 for (currSeqCount = 0; currSeqCount < sequenceNames.size(); currSeqCount++) {// you 134 // could 135 // also 136 // check 137 // for 138 // order 139 // of 140 // names 141 if (line.indexOf((String) sequenceNames.get(currSeqCount)) == -1) { 142 break; 143 } // error 144 145 startOfData = line.indexOf((String) sequenceNames 146 .get(currSeqCount)) 147 + ((String) sequenceNames.get(currSeqCount)) 148 .length(); 149 line = (line.substring(startOfData)); 150 line = removewhitespace.matcher(line).replaceAll(""); 151 sequenceData[currSeqCount].append(line); // make into string 152 // buffer 153 line = reader.readLine(); 154 if ((currSeqCount < sequenceNames.size() - 1) 155 && (line.trim().length() == 0)) { 156 break; 157 } // could be an error 158 } 159 // until you get a line that matches the first sequence 160 while ((line != null) 161 && (line.indexOf((String) sequenceNames.get(0)) == -1)) // || 162 // ( 163 // (line.trim()) 164 // .length()>0 165 // ) 166 // ) 167 { 168 line = reader.readLine(); 169 } 170 } 171 // print them out for testing 172 if (DEBUGPRINT) { 173 for (currSeqCount = 0; currSeqCount < sequenceNames.size(); currSeqCount++) { 174 System.out.println((String) sequenceNames.get(currSeqCount) 175 + ":" + sequenceData[currSeqCount]); 176 } 177 } 178 // check DNA, RNA or Prot 179 StringBuffer testString = new StringBuffer(); 180 for (currSeqCount = 0; currSeqCount < sequenceNames.size(); currSeqCount++) { 181 testString.append(sequenceData[currSeqCount]); 182 } 183 String testStringUpper = testString.toString().toUpperCase(); 184 185 // now parse through them and create gapped symbol lists 186 LinkedHashMap sequenceDataMap = new LinkedHashMap(); 187 FiniteAlphabet alph = null; 188 189 for (int i = 0; i < testStringUpper.length(); i++) { 190 char c = testStringUpper.charAt(i); 191 if (c == 'F' || c == 'L' || c == 'I' || c == 'P' || c == 'Q' 192 || c == 'E') { 193 alph = ProteinTools.getTAlphabet(); 194 break; 195 } 196 } 197 if (alph == null) { 198 alph = DNATools.getDNA(); 199 } 200 for (currSeqCount = 0; currSeqCount < sequenceNames.size(); currSeqCount++) { 201 String sd = sequenceData[currSeqCount].toString(); 202 // change stop codons to specified symbols 203 sd = sd.replace('~', '-'); // sometimes this is a term signal 204 // not a gap 205 sd = sd.replace('.', '-'); // sometimes this is a term signal 206 // not a gap 207 sequenceDataMap.put((String) sequenceNames.get(currSeqCount), 208 alph == ProteinTools.getTAlphabet() ? ProteinTools 209 .createGappedProteinSequence(sd, 210 (String) sequenceNames 211 .get(currSeqCount)) : DNATools 212 .createGappedDNASequence(sd, 213 (String) sequenceNames 214 .get(currSeqCount))); 215 } 216 SimpleAlignment sa = new SimpleAlignment(sequenceDataMap); 217 return (sa); 218 } catch (Exception e) { 219 e.printStackTrace(); 220 System.err.println("MSFFormatReader " + e.getMessage()); 221 // throw (e); 222 } 223 return (null); 224 } // end read it 225 226 // This is where I am writing an alignment writer 227 public void write(OutputStream os, Alignment align, int fileType) 228 throws BioException, IllegalSymbolException { 229 PrintStream out = new PrintStream(os); 230 Object labels[] = align.getLabels().toArray(); 231 int numSeqs = labels.length; 232 Iterator<?> seqIts[] = new Iterator<?>[numSeqs]; 233 int maxLabelLength = 0; 234 for (int i = 0; i < numSeqs; i++) { 235 seqIts[i] = align.symbolListForLabel(labels[i].toString()) 236 .iterator(); 237 if (((String) labels[i]).length() > maxLabelLength) { 238 maxLabelLength = ((String) labels[i]).length(); 239 } 240 } 241 String nl = System.getProperty("line.separator"); 242 SymbolTokenization toke = null; 243 244 // really should determine the filetype based on one of the seqeunces 245 // alphabet 246 247 if (align.symbolListForLabel(labels[0].toString()).getAlphabet() == DNATools 248 .getDNA()) { 249 fileType = DNA; 250 251 } else if (align.symbolListForLabel(labels[0].toString()).getAlphabet() == ProteinTools 252 .getAlphabet() 253 || align.symbolListForLabel(labels[0].toString()).getAlphabet() == ProteinTools 254 .getTAlphabet()) { 255 fileType = PROTEIN; 256 } 257 258 if (fileType == DNA) { 259 out.print("PileUp" + nl); 260 out.print(nl); 261 out.print(" MSF: " + align.length() + " Type: "); 262 out.print("N"); 263 out.print(" Check: " + 0 + " .." + nl); 264 toke = DNATools.getDNA().getTokenization("token"); 265 } else if (fileType == PROTEIN) { 266 out.print("PileUp" + nl); 267 out.print(nl); 268 out.print(" MSF: " + align.length() + " Type: "); 269 out.print("P"); 270 out.print(" Check: " + 0 + " .." + nl); 271 toke = ProteinTools.getTAlphabet().getTokenization("token"); 272 } else { 273 System.out 274 .println("MSFAlignment.write -- File type not recognized."); 275 return; 276 } 277 out.print(nl); 278 279 for (int i = 0; i < numSeqs; i++) { 280 out.print(" Name: " + labels[i]); 281 for (int j = 0; j < (maxLabelLength - ((String) labels[i]).length()); j++) {// padding 282 out.print(" "); 283 } 284 out.print(" Len: " + align.length() + " Check: " + 0 285 + " Weight: " + 0 + nl); // this really should be seq 286 // length? 287 } 288 289 out.println(nl + "//" + nl + nl); 290 // now should print the numbering line 291 292 while (seqIts[0].hasNext()) { 293 for (int i = 0; i < numSeqs; i++) { 294 while (((String) labels[i]).length() < maxLabelLength + 1) { 295 labels[i] = " " + labels[i]; 296 } 297 out.print(labels[i] + " "); 298 theLabel: for (int j = 0; j < 5; j++) { 299 out.print(" "); 300 for (int k = 0; k < 10; k++) { 301 if (seqIts[i].hasNext()) { 302 out.print(toke.tokenizeSymbol((Symbol) seqIts[i] 303 .next())); 304 } else { 305 break theLabel; 306 } 307 } 308 } 309 out.print(nl); 310 } 311 out.print(nl); 312 } 313 314 } // end write 315 316 public void writeDna(OutputStream os, Alignment align) throws BioException, 317 IllegalSymbolException { 318 write(os, align, DNA); 319 } 320 321 public void writeProtein(OutputStream os, Alignment align) 322 throws BioException, IllegalSymbolException { 323 write(os, align, PROTEIN); 324 } 325 326} // end class 327