001/*
002 * @(#)ORonn.java 1.0 June 2010
003 *
004 * Copyright (c) 2010 Peter Troshin
005 *
006 *        BioJava development code
007 *
008 * This code may be freely distributed and modified under the
009 * terms of the GNU Lesser General Public Licence.  This should
010 * be distributed with the code.  If you do not have a copy,
011 * see:
012 *
013 *      http://www.gnu.org/copyleft/lesser.html
014 *
015 * Copyright for this code is held jointly by the individual
016 * authors.  These should be listed in @author doc comments.
017 *
018 * For more information on the BioJava project and its aims,
019 * or to join the biojava-l mailing list, visit the home page
020 * at:
021 *
022 *      http://www.biojava.org/
023 *
024 */
025package org.biojava.nbio.ronn;
026
027import org.biojava.nbio.data.sequence.FastaSequence;
028import org.biojava.nbio.data.sequence.SequenceUtil;
029import org.biojava.nbio.ronn.ModelLoader.Model;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import java.io.FileInputStream;
034import java.io.IOException;
035import java.io.PrintWriter;
036import java.text.DateFormat;
037import java.text.NumberFormat;
038import java.util.Date;
039import java.util.List;
040import java.util.Locale;
041import java.util.concurrent.*;
042import java.util.stream.IntStream;
043import java.util.stream.Stream;
044
045
046/**
047 * Fully re-factored and enhanced version of RONN.
048 *
049 * This class does the calculation and contains the main for the command line client.
050 *
051 * @author Peter Troshin
052 * @version 1.0
053 * @since 3.0.2
054
055 * TODO refactor
056 */
057public final class ORonn implements Callable<ORonn> {
058
059        private static final Logger logger = LoggerFactory.getLogger(ORonn.class);
060
061        private static final DateFormat DATE_FORMAT = DateFormat
062                        .getDateTimeInstance(DateFormat.LONG, DateFormat.LONG, Locale.US);
063
064        private static final NumberFormat nformat = NumberFormat.getInstance();
065        static {
066                ORonn.nformat.setMaximumFractionDigits(2);
067        }
068
069
070        static final byte NUMBER_OF_MODELS = 10;
071        private final FastaSequence sequence;
072        private final ModelLoader mloader;
073        private final PrintWriter out;
074        private final ResultLayout layout;
075        private final PrintWriter stat;
076        private final Timer timer;
077        private final float disorder;
078
079        // This gets initialized after calling a call method!
080        private float[] cummulativeScore;
081
082
083        ORonn(final FastaSequence sequence, final ModelLoader mloader,
084                        final InputParameters params) throws NumberFormatException,
085                        IOException {
086                this.sequence = sequence;
087                this.mloader = mloader;
088                out = params.getOutputWriter();
089                assert out != null;
090                layout = params.getFormat();
091                stat = params.getStatWriter();
092                disorder = params.getDisorder();
093                timer = new Timer(TimeUnit.MILLISECONDS);
094        }
095        //This constructor is for API calls where the caller collects the results directly
096        ORonn(final FastaSequence sequence, final ModelLoader mloader) throws NumberFormatException,
097        IOException {
098                this.sequence = sequence;
099                this.mloader = mloader;
100                out = new PrintWriter(new NullOutputStream());
101                layout = ResultLayout.HORIZONTAL;
102                stat = new PrintWriter(new NullOutputStream());
103                disorder = RonnConstraint.DEFAULT_DISORDER;
104                timer = new Timer(TimeUnit.MILLISECONDS);
105        }
106
107        void writeResults(final float[] meanScores, final char[] seqs) {
108
109                synchronized (out)
110                {
111                        out.println(">" + sequence.getId());
112                        if (layout == ResultLayout.VERTICAL) {
113                                for (int i = 0; i < meanScores.length; i++) {
114                                        out.printf(Locale.US, "%c\t%.2f%n", seqs[i], meanScores[i]);
115                                        //out.printf(Locale.US, "%c\t%f%n", seqs[i], meanScores[i]);
116                                }
117                        } else {
118                                final StringBuilder seqLine = new StringBuilder();
119                                final StringBuilder resultLine = new StringBuilder();
120                                final String spacer = "\t";
121                                for (int i = 0; i < meanScores.length; i++) {
122                                        seqLine.append(seqs[i]);
123                                        seqLine.append(spacer);
124                                        resultLine.append(ORonn.nformat.format(meanScores[i]));
125                                        resultLine.append(spacer);
126                                }
127                                out.println(seqLine.toString());
128                                out.println(resultLine.toString());
129                        }
130                        out.println();
131                        out.flush();
132                }
133        }
134
135        static boolean isValidSequence(final FastaSequence fsequence) {
136                assert fsequence != null;
137                return fsequence.getLength() > RonnConstraint.MIN_SEQUENCE_LENGTH;
138        }
139
140        @Override
141        public ORonn call() throws NumberFormatException, IOException {
142                final String seq = sequence.getSequence();
143                // Calculate for each model
144                Stream.iterate(0, n -> n +1).limit(NUMBER_OF_MODELS).map(modelNumber -> mloader.getModel(modelNumber))
145                                                                                                                                 .map(rmodel -> new ORonnModel(seq, rmodel, disorder).detect())
146                                                                                                                                 .forEach(score ->addScore(score));
147                final char[] ch = seq.toCharArray();
148                final float[] meanScores = getMeanScores();
149                assert meanScores.length == seq.length() : "Scores are not calculated for "
150                                + "all residues!";
151                writeResults(meanScores, ch);
152                stat.println(timer.getTotalTime() + "ms prediction completed for "
153                                + sequence.getId());
154                return this;
155        }
156
157        private void addScore(final float[] scores) {
158                // For the first time just add all elements
159                if (cummulativeScore == null) {
160                        cummulativeScore = scores;
161                        return;
162                }
163                if (cummulativeScore.length != scores.length) {
164                        throw new IllegalArgumentException("Expected "
165                                        + cummulativeScore.length + " but get " + scores.length);
166                }
167                for (int i = 0; i < scores.length; i++) {
168                        cummulativeScore[i] += scores[i];
169                }
170        }
171
172        float[] getMeanScores() {
173                final float[] meanScores = new float[cummulativeScore.length];
174                for (int i = 0; i < cummulativeScore.length; i++) {
175                        meanScores[i] = cummulativeScore[i] / ORonn.NUMBER_OF_MODELS;
176                }
177                return meanScores;
178        }
179
180        /**
181         *
182         * @author pvtroshin
183         *
184         * VERTICAL - where the letters of the sequence and corresponding disorder values are
185         * output in two column layout.
186         *
187         * HORIZONTAL where the disorder values are provided under the letters of the
188         * sequence. Letters and values separated by tabulation in      this case.
189         *
190         */
191        static enum ResultLayout {
192                VERTICAL, HORIZONTAL
193        }
194
195        static void printUsage() {
196                logger.error(RonnConstraint.HELP_MESSAGE);
197        }
198
199        static boolean isValidSequenceForRonn(final FastaSequence fsequence,
200                        final PrintWriter stat) {
201                boolean valid = true;
202                String message = "";
203                if (!ORonn.isValidSequence(fsequence)) {
204                        message = "IGNORING sequence "
205                                        + fsequence.getId()
206                                        + " as its too short. Minimum sequence length for disorder prediction is "
207                                        + (RonnConstraint.MIN_SEQUENCE_LENGTH + 1) + " characters!";
208                        stat.println(message);
209                        logger.warn(message);
210                        valid = false;
211                }
212                final String sequence = fsequence.getSequence();
213                if (!(SequenceUtil.isProteinSequence(sequence) || SequenceUtil
214                                .isAmbiguosProtein(sequence))) {
215                        message = "IGNORING sequence " + fsequence.getId()
216                                        + " as it is not a protein sequence!";
217                        stat.println(message);
218                        logger.warn(message);
219                        valid = false;
220                }
221                return valid;
222        }
223
224        static void validateSequenceForRonn(final FastaSequence fsequence) {
225
226                String message = "";
227                if (!ORonn.isValidSequence(fsequence)) {
228                        message = "IGNORING sequence "
229                                        + fsequence.getId()
230                                        + " as its too short. Minimum sequence length for disorder prediction is "
231                                        + (RonnConstraint.MIN_SEQUENCE_LENGTH + 1) + " characters!";
232                        throw new IllegalArgumentException(message);
233                }
234                final String sequence = fsequence.getSequence();
235
236                if ( SequenceUtil.isAmbiguosProtein(sequence)){
237                        logger.warn("Sequence is ambiguous!");
238                }
239
240                if (!(SequenceUtil.isProteinSequence(sequence) )){
241                        logger.warn("Does not look like a protein sequence!");
242                }
243
244                if (!(SequenceUtil.isProteinSequence(sequence) || SequenceUtil
245                                .isAmbiguosProtein(sequence))) {
246                        message = "IGNORING sequence " + fsequence.getId()
247                                        + " as it is not a protein sequence!";
248                        throw new IllegalArgumentException(message);
249                }
250        }
251
252        private static InputParameters parseArguments(final String[] args)
253                        throws IOException {
254                final InputParameters prms = new InputParameters();
255                for (int i = 0; i < args.length; i++) {
256                        final String prm = args[i].trim().toLowerCase();
257                        if (prm.startsWith(InputParameters.inputKey)) {
258                                prms.setFilePrm(args[i], InputParameters.inputKey);
259                        }
260                        if (prm.startsWith(InputParameters.outputKey)) {
261                                prms.setFilePrm(args[i], InputParameters.outputKey);
262                        }
263                        if (prm.startsWith(InputParameters.disorderKey)) {
264                                prms.setDisorder(prm);
265                        }
266                        if (prm.startsWith(InputParameters.formatKey)) {
267                                prms.setFormat(prm);
268                        }
269                        if (prm.startsWith(InputParameters.statKey)) {
270                                prms.setFilePrm(args[i], InputParameters.statKey);
271                        }
272                        if (prm.startsWith(InputParameters.threadKey)) {
273                                prms.setThreadNum(prm);
274                        }
275
276                }
277                return prms;
278        }
279
280        public static void main(final String[] args) throws NumberFormatException,
281        IOException {
282
283                if ((args.length == 0) || (args.length > 5)) {
284                        ORonn.printUsage();
285                        System.exit(1);
286                }
287                final InputParameters prms = ORonn.parseArguments(args);
288
289                final PrintWriter stat = prms.getStatWriter();
290                stat.println("Using parameters: \n[" + prms + "]");
291
292                if (prms.getInput() == null) {
293                        logger.error("Input is not defined! ");
294                        ORonn.printUsage();
295                        System.exit(1);
296                }
297                stat.println("Calculation started: "
298                                + ORonn.DATE_FORMAT.format(new Date()));
299
300                final Timer timer = new Timer();
301                // The stream is closed after reading inside readFasta
302                final List<FastaSequence> sequences = SequenceUtil
303                                .readFasta(new FileInputStream(prms.getInput()));
304                stat.println(timer.getStepTime(TimeUnit.MILLISECONDS)
305                                + "ms input file loaded");
306                stat.println("Input file has " + sequences.size() + " sequences");
307
308                final ModelLoader mloader = new ModelLoader();
309                mloader.loadModels();
310
311                final PrintWriter out = prms.getOutputWriter();
312                assert out != null;
313
314                // do serial execution
315                if (prms.getThreadNum() == 1) {
316                        stat.println("Running predictions serially");
317                        ORonn.predictSerial(sequences, prms, mloader);
318                } else {
319                        // Run predictions in parallel
320                        stat.print("Running preditions in parallel - ");
321                        stat.println("Using " + prms.getThreadNum() + " threads");
322                        ORonn.predictParallel(sequences, prms, mloader);
323                }
324
325                stat.println("Total calculation time: " + timer.getTotalTime() + "s ");
326                stat.println("Calculation completed: "
327                                + ORonn.DATE_FORMAT.format(new Date()));
328                stat.close();
329                out.flush();
330                out.close();
331        }
332
333        static void predictSerial(final List<FastaSequence> fsequences,
334                        final InputParameters prms, final ModelLoader mloader)
335                                        throws NumberFormatException, IOException {
336                for (final FastaSequence sequence : fsequences) {
337                        if (!ORonn.isValidSequenceForRonn(sequence, prms.getStatWriter())) {
338                                continue;
339                        }
340                        final ORonn ronn = new ORonn(sequence, mloader, prms);
341                        ronn.call();
342                }
343        }
344
345
346        static void predictParallel(final List<FastaSequence> fsequences,
347                        final InputParameters prms, final ModelLoader mloader)
348                                        throws NumberFormatException, IOException {
349                final PrintWriter stat = prms.getStatWriter();
350
351                // Do parallel execution
352                final ExecutorService executor = new ThreadPoolExecutor(prms
353                                .getThreadNum(), prms.getThreadNum(), 0L, TimeUnit.SECONDS,
354                                new SynchronousQueue<Runnable>(),
355                                new ThreadPoolExecutor.CallerRunsPolicy());
356                try {
357                        for (final FastaSequence sequence : fsequences) {
358                                if (!ORonn.isValidSequenceForRonn(sequence, stat)) {
359                                        continue;
360                                }
361                                final ORonn ronn = new ORonn(sequence, mloader, prms);
362                                /*
363                                 * To get stack traces from tasks one need to obtain a Future
364                                 * from this method and call its get() method. Otherwise some
365                                 * task may end up with exception but unnoticed
366                                 */
367                                executor.submit(ronn);
368                        }
369                        executor.shutdown();
370                        final int timeOut = (fsequences.size() < 60) ? 60 : fsequences
371                                        .size();
372                        stat.println("All task submitted. Waiting for complition for "
373                                        + "maximum of " + timeOut + " minutes");
374                        executor.awaitTermination(timeOut, TimeUnit.MINUTES);
375                } catch (final InterruptedException e) {
376                        logger.error("Execution is terminated! "
377                                        + "Terminated by either by the system or the timeout. "
378                                        + "Maximum of 1 minute is allowed for one sequence analisys! "
379                                        + "If it took longer to complite this analysis "
380                                        + "the program is terminated.", e);
381                } finally {
382                        executor.shutdownNow();
383                }
384        }
385
386} // class end