001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.bio.BioException;
import org.biojava.bio.alignment.Alignment;
import org.biojava.bio.alignment.SimpleAlignment;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.ProteinTools;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
043
044/**
045 * @author raemig
046 * @author Thomas Down
047 * @author Keith James
048 * @author Nimesh Singh
049 * @author Mark Schreiber
050 * @author Matthew Pocock
051 * @author Bradford Powell
052 */
053
054public class MSFAlignmentFormat implements AlignmentFormat {
055        private static final boolean DEBUGPRINT = false;
056        private static final int DNA = 1;
057        private static final int PROTEIN = 2;
058
059        public MSFAlignmentFormat() {
060        }
061
062        /**
063         * used to quick test the code
064         * 
065         * @param args
066         */
067        public static void main(String[] args) {
068                String filename;
069                if (args.length < 1) {
070                        filename = "SimpleMSF.msf"; // change to your favorite
071                } else {
072                        filename = args[0];
073                }
074                try {
075                        BufferedReader reader = new BufferedReader(new FileReader(filename));
076                        MSFAlignmentFormat MSFAlignmentFormat1 = new MSFAlignmentFormat();
077                        MSFAlignmentFormat1.read(reader);
078                } catch (Exception E) {
079                }
080        }
081
082        /**
083         * Reads an MSF Alignment File
084         * 
085         * @param reader
086         *            The file reader
087         * @return Alignment A SimpleAlignment consisting of the sequences in the
088         *         file.
089         */
090        public Alignment read(BufferedReader reader) {
091                Vector sequenceNames = new Vector();
092                String sequenceName = null;
093                StringBuffer sequenceData[] = null;
094                int startOfData = 0; // the start of the sequence data in the line
095                int currSeqCount = 0; // which sequence data you are currently trying to
096                // get
097                try {
098                        Pattern mtc = Pattern
099                                        .compile("(Name:|NAME:)\\s+(.*?)\\s+(oo|OO|Len:|LEN:)");
100                        Pattern removewhitespace = Pattern.compile("\\s");
101                        // REMatch rem = null;
102                        String line = reader.readLine();
103                        // parse past header
104                        while (line.toUpperCase().indexOf("NAME:") == -1) {
105                                line = reader.readLine();
106                        }
107                        // read each name (between Name: and Len:
108                        while ((line.indexOf("//") == -1) && ((line.trim()).length() != 0)) {
109                                Matcher matcher = mtc.matcher(line);
110                                if (!matcher.find()) {
111                                        break;
112                                } // end of sequence names
113                                // sequenceName = line.substring(rem.getSubStartIndex(1),
114                                // rem.getSubEndIndex(1));
115                                if ((line.trim()).length() == 0) {
116                                        break;
117                                }
118                                sequenceName = matcher.group(2).trim();
119                                sequenceNames.add(sequenceName);
120
121                                line = reader.readLine();
122                        }
123                        sequenceData = new StringBuffer[sequenceNames.size()];
124                        for (int it = 0; it < sequenceNames.size(); it++) {
125                                sequenceData[it] = new StringBuffer();
126                        }
127                        // until you get a line that matches the first sequence
128                        while (line.indexOf((String) sequenceNames.get(0)) == -1) {
129                                line = reader.readLine();
130                        }
131                        // now you on the first line of the sequence data
132                        while (line != null) {
133                                for (currSeqCount = 0; currSeqCount < sequenceNames.size(); currSeqCount++) {// you
134                                        // could
135                                        // also
136                                        // check
137                                        // for
138                                        // order
139                                        // of
140                                        // names
141                                        if (line.indexOf((String) sequenceNames.get(currSeqCount)) == -1) {
142                                                break;
143                                        } // error
144
145                                        startOfData = line.indexOf((String) sequenceNames
146                                                        .get(currSeqCount))
147                                                        + ((String) sequenceNames.get(currSeqCount))
148                                                                        .length();
149                                        line = (line.substring(startOfData));
150                                        line = removewhitespace.matcher(line).replaceAll("");
151                                        sequenceData[currSeqCount].append(line); // make into string
152                                        // buffer
153                                        line = reader.readLine();
154                                        if ((currSeqCount < sequenceNames.size() - 1)
155                                                        && (line.trim().length() == 0)) {
156                                                break;
157                                        } // could be an error
158                                }
159                                // until you get a line that matches the first sequence
160                                while ((line != null)
161                                                && (line.indexOf((String) sequenceNames.get(0)) == -1)) // ||
162                                // (
163                                // (line.trim())
164                                // .length()>0
165                                // )
166                                // )
167                                {
168                                        line = reader.readLine();
169                                }
170                        }
171                        // print them out for testing
172                        if (DEBUGPRINT) {
173                                for (currSeqCount = 0; currSeqCount < sequenceNames.size(); currSeqCount++) {
174                                        System.out.println((String) sequenceNames.get(currSeqCount)
175                                                        + ":" + sequenceData[currSeqCount]);
176                                }
177                        }
178                        // check DNA, RNA or Prot
179                        StringBuffer testString = new StringBuffer();
180                        for (currSeqCount = 0; currSeqCount < sequenceNames.size(); currSeqCount++) {
181                                testString.append(sequenceData[currSeqCount]);
182                        }
183                        String testStringUpper = testString.toString().toUpperCase();
184
185                        // now parse through them and create gapped symbol lists
186                        LinkedHashMap sequenceDataMap = new LinkedHashMap();
187                        FiniteAlphabet alph = null;
188
189                        for (int i = 0; i < testStringUpper.length(); i++) {
190                                char c = testStringUpper.charAt(i);
191                                if (c == 'F' || c == 'L' || c == 'I' || c == 'P' || c == 'Q'
192                                                || c == 'E') {
193                                        alph = ProteinTools.getTAlphabet();
194                                        break;
195                                }
196                        }
197                        if (alph == null) {
198                                alph = DNATools.getDNA();
199                        }
200                        for (currSeqCount = 0; currSeqCount < sequenceNames.size(); currSeqCount++) {
201                                String sd = sequenceData[currSeqCount].toString();
202                                // change stop codons to specified symbols
203                                sd = sd.replace('~', '-'); // sometimes this is a term signal
204                                // not a gap
205                                sd = sd.replace('.', '-'); // sometimes this is a term signal
206                                // not a gap
207                                sequenceDataMap.put((String) sequenceNames.get(currSeqCount),
208                                                alph == ProteinTools.getTAlphabet() ? ProteinTools
209                                                                .createGappedProteinSequence(sd,
210                                                                                (String) sequenceNames
211                                                                                                .get(currSeqCount)) : DNATools
212                                                                .createGappedDNASequence(sd,
213                                                                                (String) sequenceNames
214                                                                                                .get(currSeqCount)));
215                        }
216                        SimpleAlignment sa = new SimpleAlignment(sequenceDataMap);
217                        return (sa);
218                } catch (Exception e) {
219                        e.printStackTrace();
220                        System.err.println("MSFFormatReader " + e.getMessage());
221                        // throw (e);
222                }
223                return (null);
224        } // end read it
225
226        // This is where I am writing an alignment writer
227        public void write(OutputStream os, Alignment align, int fileType)
228                        throws BioException, IllegalSymbolException {
229                PrintStream out = new PrintStream(os);
230                Object labels[] = align.getLabels().toArray();
231                int numSeqs = labels.length;
232                Iterator<?> seqIts[] = new Iterator<?>[numSeqs];
233                int maxLabelLength = 0;
234                for (int i = 0; i < numSeqs; i++) {
235                        seqIts[i] = align.symbolListForLabel(labels[i].toString())
236                                        .iterator();
237                        if (((String) labels[i]).length() > maxLabelLength) {
238                                maxLabelLength = ((String) labels[i]).length();
239                        }
240                }
241                String nl = System.getProperty("line.separator");
242                SymbolTokenization toke = null;
243
244                // really should determine the filetype based on one of the seqeunces
245                // alphabet
246
247                if (align.symbolListForLabel(labels[0].toString()).getAlphabet() == DNATools
248                                .getDNA()) {
249                        fileType = DNA;
250
251                } else if (align.symbolListForLabel(labels[0].toString()).getAlphabet() == ProteinTools
252                                .getAlphabet()
253                                || align.symbolListForLabel(labels[0].toString()).getAlphabet() == ProteinTools
254                                                .getTAlphabet()) {
255                        fileType = PROTEIN;
256                }
257
258                if (fileType == DNA) {
259                        out.print("PileUp" + nl);
260                        out.print(nl);
261                        out.print(" MSF: " + align.length() + "  Type: ");
262                        out.print("N");
263                        out.print("   Check: " + 0 + "   .." + nl);
264                        toke = DNATools.getDNA().getTokenization("token");
265                } else if (fileType == PROTEIN) {
266                        out.print("PileUp" + nl);
267                        out.print(nl);
268                        out.print(" MSF: " + align.length() + "  Type: ");
269                        out.print("P");
270                        out.print("   Check: " + 0 + "   .." + nl);
271                        toke = ProteinTools.getTAlphabet().getTokenization("token");
272                } else {
273                        System.out
274                                        .println("MSFAlignment.write -- File type not recognized.");
275                        return;
276                }
277                out.print(nl);
278
279                for (int i = 0; i < numSeqs; i++) {
280                        out.print(" Name: " + labels[i]);
281                        for (int j = 0; j < (maxLabelLength - ((String) labels[i]).length()); j++) {// padding
282                                out.print(" ");
283                        }
284                        out.print("  Len: " + align.length() + "        Check: " + 0
285                                        + "     Weight: " + 0 + nl); // this really should be seq
286                        // length?
287                }
288
289                out.println(nl + "//" + nl + nl);
290                // now should print the numbering line
291
292                while (seqIts[0].hasNext()) {
293                        for (int i = 0; i < numSeqs; i++) {
294                                while (((String) labels[i]).length() < maxLabelLength + 1) {
295                                        labels[i] = " " + labels[i];
296                                }
297                                out.print(labels[i] + " ");
298                                theLabel: for (int j = 0; j < 5; j++) {
299                                        out.print(" ");
300                                        for (int k = 0; k < 10; k++) {
301                                                if (seqIts[i].hasNext()) {
302                                                        out.print(toke.tokenizeSymbol((Symbol) seqIts[i]
303                                                                        .next()));
304                                                } else {
305                                                        break theLabel;
306                                                }
307                                        }
308                                }
309                                out.print(nl);
310                        }
311                        out.print(nl);
312                }
313
314        } // end write
315
316        public void writeDna(OutputStream os, Alignment align) throws BioException,
317                        IllegalSymbolException {
318                write(os, align, DNA);
319        }
320
321        public void writeProtein(OutputStream os, Alignment align)
322                        throws BioException, IllegalSymbolException {
323                write(os, align, PROTEIN);
324        }
325
326} // end class
327