001/*
002 * @(#)ORonn.java 1.0 June 2010
003 *
004 * Copyright (c) 2010 Peter Troshin
005 *
006 *        BioJava development code
007 *
008 * This code may be freely distributed and modified under the
009 * terms of the GNU Lesser General Public Licence.  This should
010 * be distributed with the code.  If you do not have a copy,
011 * see:
012 *
013 *      http://www.gnu.org/copyleft/lesser.html
014 *
015 * Copyright for this code is held jointly by the individual
016 * authors.  These should be listed in @author doc comments.
017 *
018 * For more information on the BioJava project and its aims,
019 * or to join the biojava-l mailing list, visit the home page
020 * at:
021 *
022 *      http://www.biojava.org/
023 *
024 */
025package org.biojava.nbio.ronn;
026
027import org.biojava.nbio.data.sequence.FastaSequence;
028import org.biojava.nbio.data.sequence.SequenceUtil;
029import org.biojava.nbio.ronn.ModelLoader.Model;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import java.io.FileInputStream;
034import java.io.IOException;
035import java.io.PrintWriter;
036import java.text.DateFormat;
037import java.text.NumberFormat;
038import java.util.Date;
039import java.util.List;
040import java.util.Locale;
041import java.util.concurrent.*;
042
043
044/**
045 * Fully re-factored and enhanced version of RONN.
046 *
047 * This class does the calculation and contains the main for the command line client.
048 *
049 * @author Peter Troshin
050 * @version 1.0
051 * @since 3.0.2
052
053 * TODO refactor
054 */
055public final class ORonn implements Callable<ORonn> {
056
057        private static final Logger logger = LoggerFactory.getLogger(ORonn.class);
058
059        private static final DateFormat DATE_FORMAT = DateFormat
060                        .getDateTimeInstance(DateFormat.LONG, DateFormat.LONG, Locale.US);
061
062        private static final NumberFormat nformat = NumberFormat.getInstance();
063        static {
064                ORonn.nformat.setMaximumFractionDigits(2);
065        }
066
067
068        static final byte NUMBER_OF_MODELS = 10;
069        private final FastaSequence sequence;
070        private final ModelLoader mloader;
071        private final PrintWriter out;
072        private final ResultLayout layout;
073        private final PrintWriter stat;
074        private final Timer timer;
075        private final float disorder;
076
077        // This gets initialized after calling a call method!
078        private float[] cummulativeScore;
079
080
081        ORonn(final FastaSequence sequence, final ModelLoader mloader,
082                        final InputParameters params) throws NumberFormatException,
083                        IOException {
084                this.sequence = sequence;
085                this.mloader = mloader;
086                out = params.getOutputWriter();
087                assert out != null;
088                layout = params.getFormat();
089                stat = params.getStatWriter();
090                disorder = params.getDisorder();
091                timer = new Timer(TimeUnit.MILLISECONDS);
092        }
093        //This constructor is for API calls where the caller collects the results directly
094        ORonn(final FastaSequence sequence, final ModelLoader mloader) throws NumberFormatException,
095        IOException {
096                this.sequence = sequence;
097                this.mloader = mloader;
098                out = new PrintWriter(new NullOutputStream());
099                layout = ResultLayout.HORIZONTAL;
100                stat = new PrintWriter(new NullOutputStream());
101                disorder = RonnConstraint.DEFAULT_DISORDER;
102                timer = new Timer(TimeUnit.MILLISECONDS);
103        }
104
105        void writeResults(final float[] meanScores, final char[] seqs) {
106
107                synchronized (out)
108                {
109                        out.println(">" + sequence.getId());
110                        if (layout == ResultLayout.VERTICAL) {
111                                for (int i = 0; i < meanScores.length; i++) {
112                                        out.printf("%c\t%.2f%n", seqs[i], meanScores[i]);
113                                        //out.printf("%c\t%f%n", seqs[i], meanScores[i]);
114                                }
115                        } else {
116                                final StringBuilder seqLine = new StringBuilder();
117                                final StringBuilder resultLine = new StringBuilder();
118                                final String spacer = "\t";
119                                for (int i = 0; i < meanScores.length; i++) {
120                                        seqLine.append(seqs[i]);
121                                        seqLine.append(spacer);
122                                        resultLine.append(ORonn.nformat.format(meanScores[i]));
123                                        resultLine.append(spacer);
124                                }
125                                out.println(seqLine.toString());
126                                out.println(resultLine.toString());
127                        }
128                        out.println();
129                        out.flush();
130                }
131        }
132
133        static boolean isValidSequence(final FastaSequence fsequence) {
134                assert fsequence != null;
135                return fsequence.getLength() > RonnConstraint.MIN_SEQUENCE_LENGTH;
136        }
137
138        @Override
139        public ORonn call() throws NumberFormatException, IOException {
140                final String seq = sequence.getSequence();
141                // Calculate for each model
142                for (int m = 0; m < ORonn.NUMBER_OF_MODELS; m++) {
143                        final Model model = mloader.getModel(m);
144                        final ORonnModel rmodel = new ORonnModel(seq, model, disorder);
145                        final float[] scores = rmodel.detect();
146                        addScore(scores);
147                }
148
149                final char[] ch = seq.toCharArray();
150                final float[] meanScores = getMeanScores();
151                assert meanScores.length == seq.length() : "Scores are not calculated for "
152                                + "all residues!";
153                writeResults(meanScores, ch);
154                stat.println(timer.getTotalTime() + "ms prediction completed for "
155                                + sequence.getId());
156                return this;
157        }
158
159        private void addScore(final float[] scores) {
160                // For the first time just add all elements
161                if (cummulativeScore == null) {
162                        cummulativeScore = scores;
163                        return;
164                }
165                if (cummulativeScore.length != scores.length) {
166                        throw new IllegalArgumentException("Expected "
167                                        + cummulativeScore.length + " but get " + scores.length);
168                }
169                for (int i = 0; i < scores.length; i++) {
170                        cummulativeScore[i] += scores[i];
171                }
172        }
173
174        float[] getMeanScores() {
175                final float[] meanScores = new float[cummulativeScore.length];
176                for (int i = 0; i < cummulativeScore.length; i++) {
177                        meanScores[i] = cummulativeScore[i] / ORonn.NUMBER_OF_MODELS;
178                }
179                return meanScores;
180        }
181
182        /**
183         *
184         * @author pvtroshin
185         *
186         * VERTICAL - where the letters of the sequence and corresponding disorder values are
187         * output in two column layout.
188         *
189         * HORIZONTAL where the disorder values are provided under the letters of the
190         * sequence. Letters and values separated by tabulation in      this case.
191         *
192         */
193        static enum ResultLayout {
194                VERTICAL, HORIZONTAL
195        }
196
197        static void printUsage() {
198                logger.error(RonnConstraint.HELP_MESSAGE);
199        }
200
201        static boolean isValidSequenceForRonn(final FastaSequence fsequence,
202                        final PrintWriter stat) {
203                boolean valid = true;
204                String message = "";
205                if (!ORonn.isValidSequence(fsequence)) {
206                        message = "IGNORING sequence "
207                                        + fsequence.getId()
208                                        + " as its too short. Minimum sequence length for disorder prediction is "
209                                        + (RonnConstraint.MIN_SEQUENCE_LENGTH + 1) + " characters!";
210                        stat.println(message);
211                        logger.warn(message);
212                        valid = false;
213                }
214                final String sequence = fsequence.getSequence();
215                if (!(SequenceUtil.isProteinSequence(sequence) || SequenceUtil
216                                .isAmbiguosProtein(sequence))) {
217                        message = "IGNORING sequence " + fsequence.getId()
218                                        + " as it is not a protein sequence!";
219                        stat.println(message);
220                        logger.warn(message);
221                        valid = false;
222                }
223                return valid;
224        }
225
226        static void validateSequenceForRonn(final FastaSequence fsequence) {
227
228                String message = "";
229                if (!ORonn.isValidSequence(fsequence)) {
230                        message = "IGNORING sequence "
231                                        + fsequence.getId()
232                                        + " as its too short. Minimum sequence length for disorder prediction is "
233                                        + (RonnConstraint.MIN_SEQUENCE_LENGTH + 1) + " characters!";
234                        throw new IllegalArgumentException(message);
235                }
236                final String sequence = fsequence.getSequence();
237
238                if ( SequenceUtil.isAmbiguosProtein(sequence)){
239                        logger.warn("Sequence is ambiguous!");
240                }
241
242                if (!(SequenceUtil.isProteinSequence(sequence) )){
243                        logger.warn("Does not look like a protein sequence!");
244                }
245
246                if (!(SequenceUtil.isProteinSequence(sequence) || SequenceUtil
247                                .isAmbiguosProtein(sequence))) {
248                        message = "IGNORING sequence " + fsequence.getId()
249                                        + " as it is not a protein sequence!";
250                        throw new IllegalArgumentException(message);
251                }
252        }
253
254        private static InputParameters parseArguments(final String[] args)
255                        throws IOException {
256                final InputParameters prms = new InputParameters();
257                for (int i = 0; i < args.length; i++) {
258                        final String prm = args[i].trim().toLowerCase();
259                        if (prm.startsWith(InputParameters.inputKey)) {
260                                prms.setFilePrm(args[i], InputParameters.inputKey);
261                        }
262                        if (prm.startsWith(InputParameters.outputKey)) {
263                                prms.setFilePrm(args[i], InputParameters.outputKey);
264                        }
265                        if (prm.startsWith(InputParameters.disorderKey)) {
266                                prms.setDisorder(prm);
267                        }
268                        if (prm.startsWith(InputParameters.formatKey)) {
269                                prms.setFormat(prm);
270                        }
271                        if (prm.startsWith(InputParameters.statKey)) {
272                                prms.setFilePrm(args[i], InputParameters.statKey);
273                        }
274                        if (prm.startsWith(InputParameters.threadKey)) {
275                                prms.setThreadNum(prm);
276                        }
277
278                }
279                return prms;
280        }
281
282        public static void main(final String[] args) throws NumberFormatException,
283        IOException {
284
285                if ((args.length == 0) || (args.length > 5)) {
286                        ORonn.printUsage();
287                        System.exit(1);
288                }
289                final InputParameters prms = ORonn.parseArguments(args);
290
291                final PrintWriter stat = prms.getStatWriter();
292                stat.println("Using parameters: \n[" + prms + "]");
293
294                if (prms.getInput() == null) {
295                        logger.error("Input is not defined! ");
296                        ORonn.printUsage();
297                        System.exit(1);
298                }
299                stat.println("Calculation started: "
300                                + ORonn.DATE_FORMAT.format(new Date()));
301
302                final Timer timer = new Timer();
303                // The stream is closed after reading inside readFasta
304                final List<FastaSequence> sequences = SequenceUtil
305                                .readFasta(new FileInputStream(prms.getInput()));
306                stat.println(timer.getStepTime(TimeUnit.MILLISECONDS)
307                                + "ms input file loaded");
308                stat.println("Input file has " + sequences.size() + " sequences");
309
310                final ModelLoader mloader = new ModelLoader();
311                mloader.loadModels();
312
313                final PrintWriter out = prms.getOutputWriter();
314                assert out != null;
315
316                // do serial execution
317                if (prms.getThreadNum() == 1) {
318                        stat.println("Running predictions serially");
319                        ORonn.predictSerial(sequences, prms, mloader);
320                } else {
321                        // Run predictions in parallel
322                        stat.print("Running preditions in parallel - ");
323                        stat.println("Using " + prms.getThreadNum() + " threads");
324                        ORonn.predictParallel(sequences, prms, mloader);
325                }
326
327                stat.println("Total calculation time: " + timer.getTotalTime() + "s ");
328                stat.println("Calculation completed: "
329                                + ORonn.DATE_FORMAT.format(new Date()));
330                stat.close();
331                out.flush();
332                out.close();
333        }
334
335        static void predictSerial(final List<FastaSequence> fsequences,
336                        final InputParameters prms, final ModelLoader mloader)
337                                        throws NumberFormatException, IOException {
338                for (final FastaSequence sequence : fsequences) {
339                        if (!ORonn.isValidSequenceForRonn(sequence, prms.getStatWriter())) {
340                                continue;
341                        }
342                        final ORonn ronn = new ORonn(sequence, mloader, prms);
343                        ronn.call();
344                }
345        }
346
347
348        static void predictParallel(final List<FastaSequence> fsequences,
349                        final InputParameters prms, final ModelLoader mloader)
350                                        throws NumberFormatException, IOException {
351                final PrintWriter stat = prms.getStatWriter();
352
353                // Do parallel execution
354                final ExecutorService executor = new ThreadPoolExecutor(prms
355                                .getThreadNum(), prms.getThreadNum(), 0L, TimeUnit.SECONDS,
356                                new SynchronousQueue<Runnable>(),
357                                new ThreadPoolExecutor.CallerRunsPolicy());
358                try {
359                        for (final FastaSequence sequence : fsequences) {
360                                if (!ORonn.isValidSequenceForRonn(sequence, stat)) {
361                                        continue;
362                                }
363                                final ORonn ronn = new ORonn(sequence, mloader, prms);
364                                /*
365                                 * To get stack traces from tasks one need to obtain a Future
366                                 * from this method and call its get() method. Otherwise some
367                                 * task may end up with exception but unnoticed
368                                 */
369                                executor.submit(ronn);
370                        }
371                        executor.shutdown();
372                        final int timeOut = (fsequences.size() < 60) ? 60 : fsequences
373                                        .size();
374                        stat.println("All task submitted. Waiting for complition for "
375                                        + "maximum of " + timeOut + " minutes");
376                        executor.awaitTermination(timeOut, TimeUnit.MINUTES);
377                } catch (final InterruptedException e) {
378                        logger.error("Execution is terminated! "
379                                        + "Terminated by either by the system or the timeout. "
380                                        + "Maximum of 1 minute is allowed for one sequence analisys! "
381                                        + "If it took longer to complite this analysis "
382                                        + "the program is terminated.", e);
383                } finally {
384                        executor.shutdownNow();
385                }
386        }
387
388} // class end