001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.util;
022
023import com.google.common.collect.Range;
024import org.biojava.nbio.core.sequence.DNASequence;
025import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
026import org.biojava.nbio.core.sequence.template.SequenceView;
027import org.biojava.nbio.genome.parsers.genename.ChromPos;
028import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition;
029import org.biojava.nbio.genome.parsers.twobit.TwoBitFacade;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import java.io.StringWriter;
034import java.util.ArrayList;
035import java.util.List;
036
037/**
038 *  A class that can map chromosomal positions to mRNA (coding sequence) positions.
039 *
040 *  @author Andreas Prlic
041 */
042
043public class ChromosomeMappingTools {
044
045        private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class);
046
047        private static final String newline = System.getProperty("line.separator");
048
049        public static final String CHROMOSOME = "CHROMOSOME";
050        public static final String CDS = "CDS";
051
052        private static int base = 1;
053        public static void setCoordinateSystem(int baseInt) {
054                base = baseInt;
055        }
056
057        /**
058         * Pretty print the details of a GeneChromosomePosition to a String
059         *
060         * @param chromosomePosition
061         * @return
062         */
063        public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){
064                if ( chromosomePosition.getOrientation() == '+')
065                        return formatExonStructureForward(chromosomePosition);
066                return formatExonStructureReverse(chromosomePosition);
067        }
068
069        private static String formatExonStructureForward(GeneChromosomePosition chromPos) {
070
071                StringWriter s = new StringWriter();
072
073                List<Integer> exonStarts = chromPos.getExonStarts();
074                List<Integer> exonEnds   = chromPos.getExonEnds();
075
076                int cdsStart = chromPos.getCdsStart();
077                int cdsEnd   = chromPos.getCdsEnd();
078
079                boolean inCoding = false;
080                int codingLength = 0;
081
082                for (int i = 0; i < exonStarts.size(); i++) {
083
084                        int start = exonStarts.get(i);
085                        int end = exonEnds.get(i);
086
087                        if (start <= cdsStart +1 && end >= cdsStart+1) {
088
089                                inCoding = true;
090                                codingLength += (end - cdsStart);
091                                s.append("     UTR         : ").append(format(start)).append(" - ").append(format(cdsStart));
092                                s.append(newline);
093                                s.append(" ->  Exon        : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
094                                s.append(newline);
095
096                        } else if (start <= cdsEnd && end >= cdsEnd) {
097                                //logger.debug(" <-- CDS end at: " + cdsEnd );
098                                inCoding = false;
099                                codingLength += (cdsEnd - start);
100
101                                s.append(" <-  Exon        : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
102                                s.append(newline);
103                                s.append("     UTR         : " + (cdsEnd +1) + " - " + format(end));
104                                s.append(newline);
105
106                        } else if (inCoding) {
107                                // full exon is coding
108                                codingLength += (end - start);
109
110                                s.append("     Exon        : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
111                                s.append(newline);
112                        }
113                }
114                s.append("Coding Length: ");
115                s.append((codingLength-3)+"");
116                s.append(newline);
117                return s.toString();
118        }
119
120        private static String formatExonStructureReverse(GeneChromosomePosition chromPos) {
121                StringWriter s = new StringWriter();
122
123                List<Integer> exonStarts = chromPos.getExonStarts();
124                List<Integer> exonEnds   = chromPos.getExonEnds();
125
126
127                int cdsStart = chromPos.getCdsStart();
128                int cdsEnd   = chromPos.getCdsEnd();
129
130                // logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd));
131
132                boolean inCoding = false;
133                int codingLength = 0;
134
135                if (cdsEnd < cdsStart) {
136                        int tmp = cdsEnd;
137                        cdsEnd = cdsStart;
138                        cdsStart = tmp;
139                }
140
141                // map reverse
142                for (int i = exonStarts.size() - 1; i >= 0; i--) {
143
144                        int end = exonStarts.get(i);
145                        int start = exonEnds.get(i);
146
147                        if (end < start) {
148                                int tmp = end;
149                                end = start;
150                                start = tmp;
151                        }
152
153                        if (start <= cdsEnd && end >= cdsEnd) {
154                                inCoding = true;
155
156                                int tmpstart = start;
157                                if (start < cdsStart) {
158                                        tmpstart = cdsStart;
159                                }
160                                codingLength += (cdsEnd - tmpstart);
161
162                                s.append("     UTR         :" + format(cdsEnd + 1) + " | " + format(end));
163                                s.append(newline);
164                                if (tmpstart == start)
165                                        s.append(" ->  ");
166                                else
167                                        s.append(" <-> ");
168                                s.append("Exon        :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
169                                s.append(newline);
170                                // single exon with UTR on both ends
171                                if (tmpstart != start)
172                                        s.append("     UTR         :" + format(cdsStart ) + " - " + format(start + 1));
173                                s.append(newline);
174
175                        } else if (start <= cdsStart && end >= cdsStart) {
176                                inCoding = false;
177                                codingLength += (end - cdsStart);
178
179                                s.append(" <-  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
180                                s.append(newline);
181                                s.append("     UTR         : " + format(start+1) + " - " + format(cdsStart ));
182                                s.append(newline);
183
184
185                        } else if (inCoding) {
186                                // full exon is coding
187                                codingLength += (end - start);
188
189                                s.append("     Exon        : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
190                                s.append(newline);
191                        } else {
192                                // e.g. see UBQLN3
193                                s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end));
194                                s.append(newline);
195                        }
196                }
197
198                s.append("CDS length: ").append(Integer.toString(codingLength - 3));
199                s.append(newline);
200
201                return s.toString();
202        }
203
204        /**
205         * Get the length of the CDS in nucleotides.
206         *
207         * @param chromPos
208         * @return length of the CDS in nucleotides.
209         */
210        public static int getCDSLength(GeneChromosomePosition chromPos) {
211
212                List<Integer> exonStarts = chromPos.getExonStarts();
213                List<Integer> exonEnds = chromPos.getExonEnds();
214
215                int cdsStart = chromPos.getCdsStart();
216                int cdsEnd = chromPos.getCdsEnd();
217
218                int codingLength;
219                if (chromPos.getOrientation().equals('+'))
220                        codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd);
221                else
222                        codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd);
223                return codingLength;
224        }
225
226        /**
227         * Maps the position of a CDS nucleotide back to the genome
228         *
229         * @param cdsNucleotidePosition
230         * @return a ChromPos object
231         */
232        public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) {
233
234                logger.debug(" ? Checking chromosome position for CDS position {}", cdsNucleotidePosition);
235
236                List<Integer> exonStarts = chromPos.getExonStarts();
237                List<Integer> exonEnds = chromPos.getExonEnds();
238
239                logger.debug(" Exons:{}", exonStarts.size());
240
241                int cdsStart = chromPos.getCdsStart();
242                int cdsEnd = chromPos.getCdsEnd();
243
244
245                ChromPos chromosomePos = null;
246
247                if (chromPos.getOrientation().equals('+'))
248
249                        chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
250                else
251                        chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
252
253                logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at  " + chromosomePos);
254                return chromosomePos;
255
256        }
257
258        /**
259         * Returns a nicely formatted representation of the position
260         *
261         * @param chromosomePosition
262         * @return
263         */
264        private static String format(int chromosomePosition){
265                return String.format("%,d", chromosomePosition);
266        }
267
268        /**
269         * Get the CDS position mapped on the chromosome position
270         *
271         * @param exonStarts
272         * @param exonEnds
273         * @param cdsStart
274         * @param cdsEnd
275         * @return
276         */
277        public static ChromPos getChromPosReverse(int cdsPos, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) {
278
279                boolean inCoding = false;
280                int codingLength = 0;
281
282                if (cdsEnd < cdsStart) {
283                        int tmp = cdsEnd;
284                        cdsEnd = cdsStart;
285                        cdsStart = tmp;
286                }
287
288                int lengthExons = 0;
289
290                // map reverse
291                for (int i = exonStarts.size() - 1; i >= 0; i--) {
292
293                        logger.debug("Exon #" + (i+1) + "/" + exonStarts.size());
294                        int end = exonStarts.get(i);
295                        int start = exonEnds.get(i);
296
297                        if (end < start) {
298                                int tmp = end;
299                                end = start;
300                                start = tmp;
301                        }
302                        lengthExons += end - start;
303
304                        logger.debug("     is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1));
305                        logger.debug("     CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength);
306
307                        if (start+1 <= cdsEnd && end >= cdsEnd) {
308
309                                // FIRST EXON
310                                inCoding = true;
311
312                                int tmpstart = start;
313                                if (start < cdsStart) {
314                                        tmpstart = cdsStart;
315                                }
316
317                                // here one of the few places where we don't say start+1
318                                int check = codingLength + cdsEnd - tmpstart ;
319
320                                logger.debug("First Exon    | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos );
321
322
323                                if ( ( check > cdsPos)  )  {
324                                        int tmp = cdsPos - codingLength ;
325                                        logger.debug(" -> found position in UTR exon:  " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength);
326                                        return new ChromPos((cdsEnd - tmp), -1) ;
327                                }
328
329                                // don't add 1 here
330                                codingLength += (cdsEnd - tmpstart );
331
332                                boolean debug = logger.isDebugEnabled();
333
334                                if ( debug ) {
335
336                                        StringBuffer b = new StringBuffer();
337
338                                        b.append("     UTR         :" + format(cdsEnd + 1) + " - " + format(end) + newline);
339                                        if (tmpstart == start)
340                                                b.append(" ->  ");
341                                        else
342                                                b.append(" <-> ");
343                                        b.append("Exon        :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline);
344
345                                        // single exon with UTR on both ends
346                                        if (tmpstart != start)
347                                                b.append("     UTR         :" + format(cdsStart) + " - " + format(start + 1) + newline);
348
349                                        logger.debug(b.toString());
350                                }
351                        } else if (start <= cdsStart && end >= cdsStart) {
352
353                                // LAST EXON
354                                inCoding = false;
355
356                                if (codingLength + end - cdsStart >= cdsPos) {
357
358                                        // how many remaining coding nucleotides?
359                                        int tmp = codingLength + end - cdsStart - cdsPos ;
360
361                                        logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart));
362
363                                        logger.debug(" -> XXX found position noncoding exon:  cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd));
364
365                                        return new ChromPos((cdsStart + tmp),-1);
366                                }
367
368                                codingLength += (end - cdsStart);
369
370                                logger.debug(" <-  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));
371                                logger.debug("     UTR         : " + format(start+1) + " - " + format(cdsStart ));
372
373                        } else if (inCoding) {
374
375                                if (codingLength + end - start -1  >= cdsPos) {
376
377                                        int tmp = cdsPos - codingLength ;
378
379                                        if ( tmp > (end - start ) ) {
380                                                tmp = (end - start );
381                                                logger.debug("changing tmp to {}", tmp);
382                                        }
383                                        logger.debug("     " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp);
384                                        logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
385                                        logger.debug(" ->  RRR found position coding exon:  " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength);
386
387                                        return new ChromPos((end - tmp),cdsPos %3);
388                                }
389                                // full exon is coding
390                                codingLength += (end - start) ;
391
392                                logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
393                        } else {
394                                // e.g. see UBQLN3
395                                logger.debug(" no translation!");
396                        }
397                        logger.debug("     coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos);
398                }
399
400                logger.debug("length exons: {}", lengthExons);
401                // could not map, or map over the full length??
402                return new ChromPos(-1,-1);
403
404        }
405
406        /**
407         * Get the CDS position mapped onto the chromosome position
408         *
409         * @param exonStarts
410         * @param exonEnds
411         * @param cdsStart
412         * @param cdsEnd
413         * @return
414         */
415        public static ChromPos getChromPosForward(int cdsPos, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) {
416                boolean inCoding = false;
417                int codingLength = 0;
418
419                @SuppressWarnings("unused")
420                int lengthExons = 0;
421                // map forward
422                for (int i = 0; i < exonStarts.size(); i++) {
423
424                        // start can include UTR
425                        int start = exonStarts.get(i);
426                        int end = exonEnds.get(i);
427
428                        lengthExons += end - start;
429
430                        if (start <= cdsStart +1 && end >= cdsStart+1) {
431                                // first exon with UTR
432                                if (codingLength + (end - cdsStart-1) >= cdsPos) {
433                                        // we are reaching our target position
434                                        int tmp = cdsPos - codingLength;
435
436
437                                        logger.debug(cdsStart + " | " + codingLength + " | " + tmp);
438                                        logger.debug(" -> found position in UTR exon:  #"+(i+1)+ " cdsPos:" + cdsPos +
439                                                        " return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength);
440
441                                        // we start 1 after cdsStart...
442                                        return new ChromPos((cdsStart +1 + tmp),-1);
443                                }
444                                inCoding = true;
445                                codingLength += (end - cdsStart);
446
447                                logger.debug("     UTR         : " + format(start+1) + " - " + (cdsStart ));
448                                logger.debug(" ->  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
449
450                        } else if (start+1 <= cdsEnd && end >= cdsEnd) {
451                                // LAST EXON with UTR
452                                //logger.debug(" <-- CDS end at: " + cdsEnd );
453                                inCoding = false;
454                                if (codingLength + (cdsEnd - start-1) >= cdsPos) {
455                                        int tmp = cdsPos - codingLength;
456
457                                        logger.debug(" <-  Exon        : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
458                                        logger.debug("     UTR         : " + format(cdsEnd + 1) + " - " + format(end));
459                                        logger.debug( codingLength + " | " + tmp + " | " + format(start+1));
460                                        logger.debug(" -> chromPosForward found position in non coding exon:  " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength);
461
462                                        return new ChromPos((start +1 + tmp),cdsPos%3);
463                                }
464                                codingLength += (cdsEnd - start-1);
465
466                                logger.debug(" <-  Exon        : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
467                                logger.debug("     UTR         : " + format(cdsEnd + 1) + " - " + format(end));
468
469
470                        } else if (inCoding) {
471                                // A standard coding Exon
472                                // tests for the maximum length of this coding exon
473                                if (codingLength + (end - start -1)  >= cdsPos) {
474
475                                        // we are within the range of this exon
476                                        int tmp = cdsPos - codingLength ;
477
478                                        logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength);
479                                        logger.debug(" -> found chr position in coding exon #" + (i+1) + ":  cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength);
480
481                                        return new ChromPos((start +1 + tmp),cdsPos%3);
482                                }
483                                // full exon is coding
484                                codingLength += (end - start );
485
486                                logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
487                        }
488                }
489                return new ChromPos(-1,-1);
490        }
491
492        /**
493         * Get the length of the coding sequence
494         *
495         * @param exonStarts
496         * @param exonEnds
497         * @param cdsStart
498         * @param cdsEnd
499         * @return
500         */
501        public static int getCDSLengthReverse(List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) {
502
503                int codingLength = 0;
504
505                if (cdsEnd < cdsStart) {
506                        int tmp = cdsEnd;
507                        cdsEnd = cdsStart;
508                        cdsStart = tmp;
509                }
510                cdsStart = cdsStart + base;
511
512                // map reverse
513                for (int i = exonStarts.size() - 1; i >= 0; i--) {
514
515                        int end = exonStarts.get(i);
516                        int start = exonEnds.get(i);
517
518                        if (end < start) {
519                                int tmp = end;
520                                end = start;
521                                start = tmp;
522                        }
523                        start = start + base;
524
525                        if ((start < cdsStart && end < cdsStart) || (start > cdsEnd && end > cdsEnd))
526                                continue;
527
528                        if (start < cdsStart)
529                                start = cdsStart;
530
531                        if (end > cdsEnd)
532                                end = cdsEnd;
533
534                        codingLength += (end - start + 1);
535                }
536                return codingLength - 3;
537        }
538
539        /**
540         * Get the length of the coding sequence
541         *
542         * @param exonStarts
543         * @param exonEnds
544         * @param cdsStart
545         * @param cdsEnd
546         * @return
547         */
548        public static int getCDSLengthForward(List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) {
549
550                int codingLength = 0;
551
552                for (int i = 0; i < exonStarts.size(); i++) {
553
554                        int start = exonStarts.get(i)+base;
555                        int end = exonEnds.get(i);
556
557                        if ( (start < cdsStart+base && end < cdsStart) || (start > cdsEnd && end > cdsEnd) )
558                                continue;
559
560                        if (start < cdsStart+base)
561                                start = cdsStart+base;
562
563                        if (end > cdsEnd)
564                                end = cdsEnd;
565
566                        codingLength += (end - start + 1);
567                }
568                return codingLength-3 ;
569        }
570
571        /**
572         * Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions)
573         *
574         * @param chromPos
575         * @return
576         */
577        public static List<Range<Integer>> getCDSExonRanges(GeneChromosomePosition chromPos){
578                if ( chromPos.getOrientation() == '+')
579                        return getCDSExonRangesForward(chromPos,CDS);
580                return getCDSExonRangesReverse(chromPos,CDS);
581        }
582
583        /** Extracts the boundaries of the coding regions in chromosomal coordinates
584         *
585         * @param chromPos
586         * @return
587         */
588        public static List<Range<Integer>> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){
589                if ( chromPos.getOrientation() == '+')
590                        return getCDSExonRangesForward(chromPos,CHROMOSOME);
591                return getCDSExonRangesReverse(chromPos,CHROMOSOME);
592        }
593
594        private static List<Range<Integer>> getCDSExonRangesReverse(GeneChromosomePosition chromPos, String responseType) {
595
596                List<Integer> exonStarts = chromPos.getExonStarts();
597                List<Integer> exonEnds   = chromPos.getExonEnds();
598
599                List<Range<Integer>> data = new ArrayList<>();
600                int cdsStart = chromPos.getCdsStart();
601                int cdsEnd   = chromPos.getCdsEnd();
602
603                boolean inCoding = false;
604                int codingLength = 0;
605
606                if (cdsEnd < cdsStart) {
607                        int tmp = cdsEnd;
608                        cdsEnd = cdsStart;
609                        cdsStart = tmp;
610                }
611
612                java.lang.StringBuffer s =null;
613
614                boolean debug = logger.isDebugEnabled();
615
616                if ( debug)
617                        s = new StringBuffer();
618
619                //int lengthExons = 0;
620
621                // map reverse
622                for (int i = exonStarts.size() - 1; i >= 0; i--) {
623
624                        int end = exonStarts.get(i);
625                        int start = exonEnds.get(i);
626
627                        if (end < start) {
628                                int tmp = end;
629                                end = start;
630                                start = tmp;
631                        }
632                        //lengthExons += end - start;
633                        //s.append("Reverse exon: " + end + " - " + start + " | " + (end - start));
634                        //s.append(newline);
635
636                        if (start <= cdsEnd && end >= cdsEnd) {
637                                inCoding = true;
638
639                                int tmpstart = start;
640                                if (start < cdsStart) {
641                                        tmpstart = cdsStart;
642                                }
643                                codingLength += (cdsEnd - tmpstart);
644                                if ( debug ) {
645                                        s.append("     UTR         :").append(format(cdsEnd + 1)).append(" | ").append(format(end));
646                                        s.append(newline);
647                                        if (tmpstart == start)
648                                                s.append(" ->  ");
649                                        else
650                                                s.append(" <-> ");
651                                        s.append("Exon        :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3);
652                                        s.append(newline);
653                                        // single exon with UTR on both ends
654                                        if (tmpstart != start)
655                                                s.append("     UTR         :").append(format(cdsStart)).append(" - ").append(format(start + 1));
656                                        s.append(newline);
657                                }
658
659
660                                Range<Integer> r ;
661                                if ( responseType.equals(CDS))
662                                        r = Range.closed(0,codingLength);
663                                else
664                                        r = Range.closed(tmpstart,cdsEnd);
665
666                                data.add(r);
667
668                        } else if (start <= cdsStart && end >= cdsStart) {
669                                inCoding = false;
670
671                                Range<Integer> r;
672                                if ( responseType.equals(CDS))
673                                        r = Range.closed(codingLength,codingLength+(end-cdsStart));
674                                else
675                                        r = Range.closed(cdsStart+1,end);
676
677                                data.add(r);
678
679                                codingLength += (end - cdsStart);
680                                if  (debug) {
681                                        s.append(" <-  Exon        : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
682                                        s.append(newline);
683                                        s.append("     UTR         : ").append(format(start + 1)).append(" - ").append(format(cdsStart));
684                                        s.append(newline);
685                                }
686                        } else if (inCoding) {
687                                // full exon is coding
688                                Range<Integer> r;
689                                if ( responseType.equals(CDS))
690                                        r = Range.closed(codingLength,codingLength+(end-start));
691                                else
692                                        r = Range.closed(start,end);
693                                data.add(r);
694
695                                codingLength += (end - start);
696                                if  (debug) {
697                                        s.append("     Exon        : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
698                                        s.append(newline);
699                                }
700                        } else {
701                                // e.g. see UBQLN3
702                                if ( debug ) {
703                                        s.append(" no translation! UTR: " + format(start) + " - " + format(end));
704                                        s.append(newline);
705                                }
706                        }
707                }
708                if ( debug ) {
709                        s.append("CDS length: ").append(Integer.toString(codingLength - 3));
710                        s.append(newline);
711                        logger.debug(s.toString());
712                }
713
714                return data;
715        }
716
717        private static List<Range<Integer>> getCDSExonRangesForward(GeneChromosomePosition chromPos, String responseType) {
718
719                List<Range<Integer>> data = new ArrayList<>();
720                List<Integer> exonStarts = chromPos.getExonStarts();
721                List<Integer> exonEnds   = chromPos.getExonEnds();
722
723                int cdsStart = chromPos.getCdsStart();
724                int cdsEnd   = chromPos.getCdsEnd();
725
726                boolean inCoding = false;
727                int codingLength = 0;
728
729                for (int i = 0; i < exonStarts.size(); i++) {
730
731                        int start = exonStarts.get(i);
732                        int end = exonEnds.get(i);
733
734                        if (start <= cdsStart && end >= cdsStart) {
735
736                                inCoding = true;
737                                codingLength += (end - cdsStart);
738
739                                Range<Integer> r;
740                                if ( responseType.equals(CDS))
741                                        r = Range.closed(0,codingLength);
742                                else
743                                        r = Range.closed(cdsStart,end);
744                                data.add(r);
745
746                        } else if (start <= cdsEnd && end >= cdsEnd) {
747                                //logger.debug(" <-- CDS end at: " + cdsEnd );
748                                inCoding = false;
749
750                                Range<Integer> r;
751                                if ( responseType.equals(CDS))
752                                        r = Range.closed(codingLength,codingLength+(cdsEnd-start));
753                                else
754                                        r = Range.closed(start,cdsEnd);
755                                data.add(r);
756                                codingLength += (cdsEnd - start);
757
758                        } else if (inCoding) {
759                                // full exon is coding
760                                Range<Integer> r;
761                                if ( responseType.equals(CDS))
762                                        r = Range.closed(codingLength,codingLength+(end-start));
763                                else
764                                        r = Range.closed(start,end);
765                                data.add(r);
766                                codingLength += (end - start);
767                        }
768                }
769                return data;
770        }
771
772        /**
773         * I have a genomic coordinate, where is it on the mRNA
774         *
775         * @param coordinate
776         * @param chromosomePosition
777         * @return
778         */
779        public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) {
780
781                if ( chromosomePosition.getOrientation() == '+')
782                        return getCDSPosForward(coordinate,
783                                        chromosomePosition.getExonStarts(),
784                                        chromosomePosition.getExonEnds(),
785                                        chromosomePosition.getCdsStart(),
786                                        chromosomePosition.getCdsEnd());
787
788                return getCDSPosReverse(coordinate,
789                                chromosomePosition.getExonStarts(),
790                                chromosomePosition.getExonEnds(),
791                                chromosomePosition.getCdsStart(),
792                                chromosomePosition.getCdsEnd());
793        }
794
795        /**
796         * Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
797         * living on the forward DNA strand.
798         *
799         * @param chromPos The genetic coordinate on a chromosome
800         * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
801         * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
802         * @param cdsStart The start position of a coding region
803         * @param cdsEnd The end position of a coding region
804         *
805         * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
806         *
807         * @author Yana Valasatava
808         */
809        public static int getCDSPosForward(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds,
810                        int cdsStart, int cdsEnd) {
811
812                // the genetic coordinate is not in a coding region
813                if ( (chromPos < (cdsStart+base) ) || ( chromPos > (cdsEnd+base) ) ) {
814                        logger.debug("The {} position is not in a coding region", format(chromPos));
815                        return -1;
816                }
817
818                logger.debug("looking for CDS position for {}", format(chromPos));
819
820                // map the genetic coordinates of coding region on a stretch of a reverse strand
821                List<Range<Integer>> cdsRegions = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
822
823                int codingLength = 0;
824                int lengthExon = 0;
825                for (Range<Integer> range : cdsRegions) {
826
827                        int start = range.lowerEndpoint();
828                        int end = range.upperEndpoint();
829
830                        lengthExon = end - start;
831
832                        if (start+base <= chromPos && end >= chromPos ) {
833                                return codingLength + (chromPos-start);
834                        }
835                        else {
836                                codingLength += lengthExon;
837                        }
838                }
839                return -1;
840        }
841
842        /**
843         * Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
844         * living on the reverse DNA strand.
845         *
846         * @param chromPos The genetic coordinate on a chromosome
847         * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
848         * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
849         * @param cdsStart The start position of a coding region
850         * @param cdsEnd The end position of a coding region
851         *
852         * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
853         *
854         * @author Yana Valasatava
855         */
856        public static int getCDSPosReverse(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds,
857                        int cdsStart, int cdsEnd) {
858
859                // the genetic coordinate is not in a coding region
860                if ( (chromPos < (cdsStart+base)) || ( chromPos > (cdsEnd+base) ) ) {
861                        logger.debug("The {} position is not in a coding region", format(chromPos));
862                        return -1;
863                }
864
865                logger.debug("looking for CDS position for {}", format(chromPos));
866
867                // map the genetic coordinate on a stretch of a reverse strand
868                List<Range<Integer>> cdsRegions = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
869
870                int codingLength = 0;
871                int lengthExon = 0;
872                for ( int i=cdsRegions.size()-1; i>=0; i-- ) {
873
874                        int start = cdsRegions.get(i).lowerEndpoint();
875                        int end = cdsRegions.get(i).upperEndpoint();
876
877                        lengthExon = end - start;
878                        // +1 offset to be a base 1
879                        if (start+base <= chromPos && end >= chromPos ) {
880                                return codingLength + (end-chromPos+1);
881                        }
882                        else {
883                                codingLength += lengthExon;
884                        }
885                }
886                return -1;
887        }
888
889        /**
890         * Extracts the exons boundaries in CDS coordinates corresponding to the forward DNA strand.
891         *
892         * @param origExonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
893         * @param origExonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
894         * @param cdsStart The start position of a coding region
895         * @param cdsEnd The end position of a coding region
896         *
897         * @return the list of genetic positions corresponding to the exons boundaries in CDS coordinates
898         */
899        public static List<Range<Integer>> getCDSRegions(List<Integer> origExonStarts, List<Integer> origExonEnds, int cdsStart, int cdsEnd) {
900
901                // remove exons that are fully landed in UTRs
902                List<Integer> exonStarts = new ArrayList<>(origExonStarts);
903                List<Integer> exonEnds = new ArrayList<>(origExonEnds);
904
905                int j=0;
906                for (int i = 0; i < origExonStarts.size(); i++) {
907                        if ( ( origExonEnds.get(i) < cdsStart) || ( origExonStarts.get(i) > cdsEnd) ) {
908                                exonStarts.remove(j);
909                                exonEnds.remove(j);
910                        }
911                        else {
912                                j++;
913                        }
914                }
915
916                // remove untranslated regions from exons
917                int nExons = exonStarts.size();
918                exonStarts.remove(0);
919                exonStarts.add(0, cdsStart);
920                exonEnds.remove(nExons-1);
921                exonEnds.add(cdsEnd);
922
923                List<Range<Integer>> cdsRegion = new ArrayList<>();
924                for ( int i=0; i<nExons; i++ ) {
925                        Range<Integer> r = Range.closed(exonStarts.get(i), exonEnds.get(i));
926                        cdsRegion.add(r);
927                }
928                return cdsRegion;
929        }
930
931        /**
932         * Extracts the DNA sequence transcribed from the input genetic coordinates.
933         *
934         * @param twoBitFacade the facade that provide an access to a 2bit file
935         * @param gcp The container with chromosomal positions
936         *
937         * @return the DNA sequence transcribed from the input genetic coordinates
938         */
939        public static DNASequence getTranscriptDNASequence(TwoBitFacade twoBitFacade, GeneChromosomePosition gcp) throws Exception {
940                return getTranscriptDNASequence(twoBitFacade,gcp.getChromosome(),gcp.getExonStarts(), gcp.getExonEnds(), gcp.getCdsStart(), gcp.getCdsEnd(), gcp.getOrientation());
941        }
942
943        /**
944         * Extracts the DNA sequence transcribed from the input genetic coordinates.
945         *
946         * @param chromosome the name of the chromosome
947         * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
948         * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
949         * @param cdsStart The start position of a coding region
950         * @param cdsEnd The end position of a coding region
951         * @param orientation The orientation of the strand where the gene is living
952         *
953         * @return the DNA sequence transcribed from the input genetic coordinates
954         */
955        public static DNASequence getTranscriptDNASequence(TwoBitFacade twoBitFacade, String chromosome, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd, Character orientation) throws Exception {
956
957                List<Range<Integer>> cdsRegion = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
958
959                StringBuilder dnaSequence = new StringBuilder();
960                for (Range<Integer> range : cdsRegion) {
961                        String exonSequence = twoBitFacade.getSequence(chromosome,range.lowerEndpoint(), range.upperEndpoint());
962                        dnaSequence.append(exonSequence);
963                }
964                if (orientation.equals('-')) {
965                        dnaSequence = new StringBuilder(new StringBuilder(dnaSequence.toString()).reverse().toString());
966                        DNASequence dna = new DNASequence(dnaSequence.toString());
967                        SequenceView<NucleotideCompound> compliment = dna.getComplement();
968                        dnaSequence = new StringBuilder(compliment.getSequenceAsString());
969                }
970                return new DNASequence(dnaSequence.toString().toUpperCase());
971        }
972}