001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.util;
022
023import com.google.common.collect.Range;
024import org.biojava.nbio.core.sequence.DNASequence;
025import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
026import org.biojava.nbio.core.sequence.template.SequenceView;
027import org.biojava.nbio.genome.parsers.genename.ChromPos;
028import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition;
029import org.biojava.nbio.genome.parsers.twobit.TwoBitFacade;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import java.io.StringWriter;
034import java.util.ArrayList;
035import java.util.List;
036
037/**
038 *  A class that can map chromosomal positions to mRNA (coding sequence) positions.
039 *
040 *  @author Andreas Prlic
041 */
042
043public class ChromosomeMappingTools {
044
045    private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class);
046
047    private static final String newline = System.getProperty("line.separator");
048
049    public static final String CHROMOSOME = "CHROMOSOME";
050    public static final String CDS = "CDS";
051
052    private static int base = 1;
053    public static void setCoordinateSystem(int baseInt) {
054        base = baseInt;
055    }
056
057    /** 
058     * Pretty print the details of a GeneChromosomePosition to a String
059     *
060     * @param chromosomePosition
061     * @return
062     */
063    public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){
064        if ( chromosomePosition.getOrientation() == '+')
065            return formatExonStructureForward(chromosomePosition);
066        return formatExonStructureReverse(chromosomePosition);
067    }
068
069    private static String formatExonStructureForward(GeneChromosomePosition chromPos) {
070
071        StringWriter s = new StringWriter();
072
073        List<Integer> exonStarts = chromPos.getExonStarts();
074        List<Integer> exonEnds   = chromPos.getExonEnds();
075
076        int cdsStart = chromPos.getCdsStart();
077        int cdsEnd   = chromPos.getCdsEnd();
078
079        boolean inCoding = false;
080        int codingLength = 0;
081
082        for (int i = 0; i < exonStarts.size(); i++) {
083
084            int start = exonStarts.get(i);
085            int end = exonEnds.get(i);
086
087            if (start <= cdsStart +1 && end >= cdsStart+1) {
088
089                inCoding = true;
090                codingLength += (end - cdsStart);
091                s.append("     UTR         : ").append(format(start)).append(" - ").append(format(cdsStart));
092                s.append(newline);
093                s.append(" ->  Exon        : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
094                s.append(newline);
095
096            } else if (start <= cdsEnd && end >= cdsEnd) {
097                //logger.debug(" <-- CDS end at: " + cdsEnd );
098                inCoding = false;
099                codingLength += (cdsEnd - start);
100
101                s.append(" <-  Exon        : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
102                s.append(newline);
103                s.append("     UTR         : " + (cdsEnd +1) + " - " + format(end));
104                s.append(newline);
105
106            } else if (inCoding) {
107                // full exon is coding
108                codingLength += (end - start);
109
110                s.append("     Exon        : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
111                s.append(newline);
112            }
113        }
114        s.append("Coding Length: ");
115        s.append((codingLength-3)+"");
116        s.append(newline);
117        return s.toString();
118    }
119
120    private static String formatExonStructureReverse(GeneChromosomePosition chromPos) {
121        StringWriter s = new StringWriter();
122
123        List<Integer> exonStarts = chromPos.getExonStarts();
124        List<Integer> exonEnds   = chromPos.getExonEnds();
125
126
127        int cdsStart = chromPos.getCdsStart();
128        int cdsEnd   = chromPos.getCdsEnd();
129
130        // logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd));
131
132        boolean inCoding = false;
133        int codingLength = 0;
134
135        if (cdsEnd < cdsStart) {
136            int tmp = cdsEnd;
137            cdsEnd = cdsStart;
138            cdsStart = tmp;
139        }
140
141        // map reverse
142        for (int i = exonStarts.size() - 1; i >= 0; i--) {
143
144            int end = exonStarts.get(i);
145            int start = exonEnds.get(i);
146
147            if (end < start) {
148                int tmp = end;
149                end = start;
150                start = tmp;
151            }
152
153            if (start <= cdsEnd && end >= cdsEnd) {
154                inCoding = true;
155
156                int tmpstart = start;
157                if (start < cdsStart) {
158                    tmpstart = cdsStart;
159                }
160                codingLength += (cdsEnd - tmpstart);
161
162                s.append("     UTR         :" + format(cdsEnd + 1) + " | " + format(end));
163                s.append(newline);
164                if (tmpstart == start)
165                    s.append(" ->  ");
166                else
167                    s.append(" <-> ");
168                s.append("Exon        :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
169                s.append(newline);
170                // single exon with UTR on both ends
171                if (tmpstart != start)
172                    s.append("     UTR         :" + format(cdsStart ) + " - " + format(start + 1));
173                s.append(newline);
174
175            } else if (start <= cdsStart && end >= cdsStart) {
176                inCoding = false;
177                codingLength += (end - cdsStart);
178
179                s.append(" <-  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
180                s.append(newline);
181                s.append("     UTR         : " + format(start+1) + " - " + format(cdsStart ));
182                s.append(newline);
183
184
185            } else if (inCoding) {
186                // full exon is coding
187                codingLength += (end - start);
188
189                s.append("     Exon        : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
190                s.append(newline);
191            } else {
192                // e.g. see UBQLN3
193                s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end));
194                s.append(newline);
195            }
196        }
197
198        s.append("CDS length: ").append(Integer.toString(codingLength - 3));
199        s.append(newline);
200
201        return s.toString();
202    }
203
204    /**
205     * Get the length of the CDS in nucleotides.
206     *
207     * @param chromPos
208     * @return length of the CDS in nucleotides.
209     */
210    public static int getCDSLength(GeneChromosomePosition chromPos) {
211
212        List<Integer> exonStarts = chromPos.getExonStarts();
213        List<Integer> exonEnds = chromPos.getExonEnds();
214
215        int cdsStart = chromPos.getCdsStart();
216        int cdsEnd = chromPos.getCdsEnd();
217
218        int codingLength;
219        if (chromPos.getOrientation().equals('+'))
220            codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd);
221        else
222            codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd);
223        return codingLength;
224    }
225
226    /**
227     * Maps the position of a CDS nucleotide back to the genome
228     *
229     * @param cdsNucleotidePosition
230     * @return a ChromPos object
231     */
232    public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) {
233
234        logger.debug(" ? Checking chromosome position for CDS position " + cdsNucleotidePosition);
235
236        List<Integer> exonStarts = chromPos.getExonStarts();
237        List<Integer> exonEnds = chromPos.getExonEnds();
238
239        logger.debug(" Exons:" + exonStarts.size());
240
241        int cdsStart = chromPos.getCdsStart();
242        int cdsEnd = chromPos.getCdsEnd();
243
244
245        ChromPos chromosomePos = null;
246
247        if (chromPos.getOrientation().equals('+'))
248
249            chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
250        else
251            chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
252
253        logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at  " + chromosomePos);
254        return chromosomePos;
255
256    }
257
258    /** 
259     * Returns a nicely formatted representation of the position
260     *
261     * @param chromosomePosition
262     * @return
263     */
264    private static String format(int chromosomePosition){
265        return String.format("%,d", chromosomePosition);
266    }
267
268    /**
269     * Get the CDS position mapped on the chromosome position
270     *
271     * @param exonStarts
272     * @param exonEnds
273     * @param cdsStart
274     * @param cdsEnd
275     * @return
276     */
277    public static ChromPos getChromPosReverse(int cdsPos, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) {
278
279        boolean inCoding = false;
280        int codingLength = 0;
281
282        if (cdsEnd < cdsStart) {
283            int tmp = cdsEnd;
284            cdsEnd = cdsStart;
285            cdsStart = tmp;
286        }
287
288        int lengthExons = 0;
289
290        // map reverse
291        for (int i = exonStarts.size() - 1; i >= 0; i--) {
292
293            logger.debug("Exon #" + (i+1) + "/" + exonStarts.size());
294            int end = exonStarts.get(i);
295            int start = exonEnds.get(i);
296
297            if (end < start) {
298                int tmp = end;
299                end = start;
300                start = tmp;
301            }
302            lengthExons += end - start;
303
304            logger.debug("     is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1));
305            logger.debug("     CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength);
306
307            if (start+1 <= cdsEnd && end >= cdsEnd) {
308
309                // FIRST EXON
310                inCoding = true;
311
312                int tmpstart = start;
313                if (start < cdsStart) {
314                    tmpstart = cdsStart;
315                }
316
317                // here one of the few places where we don't say start+1
318                int check = codingLength + cdsEnd - tmpstart ;
319
320                logger.debug("First Exon    | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos );
321
322
323                if ( ( check > cdsPos)  )  {
324                    int tmp = cdsPos - codingLength ;
325                    logger.debug(" -> found position in UTR exon:  " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength);
326                    return new ChromPos((cdsEnd - tmp), -1) ;
327                }
328
329                // don't add 1 here
330                codingLength += (cdsEnd - tmpstart );
331
332                boolean debug = logger.isDebugEnabled();
333
334                if ( debug ) {
335
336                    StringBuffer b = new StringBuffer();
337
338                    b.append("     UTR         :" + format(cdsEnd + 1) + " - " + format(end) + newline);
339                    if (tmpstart == start)
340                        b.append(" ->  ");
341                    else
342                        b.append(" <-> ");
343                    b.append("Exon        :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline);
344
345                    // single exon with UTR on both ends
346                    if (tmpstart != start)
347                        b.append("     UTR         :" + format(cdsStart) + " - " + format(start + 1) + newline);
348
349                    logger.debug(b.toString());
350                }
351            } else if (start <= cdsStart && end >= cdsStart) {
352
353                // LAST EXON
354                inCoding = false;
355
356                if (codingLength + end - cdsStart >= cdsPos) {
357
358                    // how many remaining coding nucleotides?
359                    int tmp = codingLength + end - cdsStart - cdsPos ;
360
361                    logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart));
362
363                    logger.debug(" -> XXX found position noncoding exon:  cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd));
364
365                    return new ChromPos((cdsStart + tmp),-1);
366                }
367
368                codingLength += (end - cdsStart);
369
370                logger.debug(" <-  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));
371                logger.debug("     UTR         : " + format(start+1) + " - " + format(cdsStart ));
372
373            } else if (inCoding) {
374
375                if (codingLength + end - start -1  >= cdsPos) {
376
377                    int tmp = cdsPos - codingLength ;
378
379                    if ( tmp > (end - start ) ) {
380                        tmp = (end - start );
381                        logger.debug("changing tmp to " + tmp);
382                    }
383                    logger.debug("     " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp);
384                    logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
385                    logger.debug(" ->  RRR found position coding exon:  " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength);
386
387                    return new ChromPos((end - tmp),cdsPos %3);
388                }
389                // full exon is coding
390                codingLength += (end - start) ;
391
392                logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
393            } else {
394                // e.g. see UBQLN3
395                logger.debug(" no translation!");
396            }
397            logger.debug("     coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos);
398        }
399
400        logger.debug("length exons: " + lengthExons);
401        // could not map, or map over the full length??
402        return new ChromPos(-1,-1);
403
404    }
405
406    /**
407     * Get the CDS position mapped onto the chromosome position
408     *
409     * @param exonStarts
410     * @param exonEnds
411     * @param cdsStart
412     * @param cdsEnd
413     * @return
414     */
415    public static ChromPos getChromPosForward(int cdsPos, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) {
416        boolean inCoding = false;
417        int codingLength = 0;
418
419        @SuppressWarnings("unused")
420                int lengthExons = 0;
421        // map forward
422        for (int i = 0; i < exonStarts.size(); i++) {
423
424            // start can include UTR
425            int start = exonStarts.get(i);
426            int end = exonEnds.get(i);
427
428            lengthExons += end - start;
429
430            if (start <= cdsStart +1 && end >= cdsStart+1) {
431                // first exon with UTR
432                if (codingLength + (end - cdsStart-1) >= cdsPos) {
433                    // we are reaching our target position
434                    int tmp = cdsPos - codingLength;
435
436
437                    logger.debug(cdsStart + " | " + codingLength + " | " + tmp);
438                    logger.debug(" -> found position in UTR exon:  #"+(i+1)+ " cdsPos:" + cdsPos +
439                            " return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength);
440
441                    // we start 1 after cdsStart...
442                    return new ChromPos((cdsStart +1 + tmp),-1);
443                }
444                inCoding = true;
445                codingLength += (end - cdsStart);
446
447                logger.debug("     UTR         : " + format(start+1) + " - " + (cdsStart ));
448                logger.debug(" ->  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
449
450            } else if (start+1 <= cdsEnd && end >= cdsEnd) {
451                // LAST EXON with UTR
452                //logger.debug(" <-- CDS end at: " + cdsEnd );
453                inCoding = false;
454                if (codingLength + (cdsEnd - start-1) >= cdsPos) {
455                    int tmp = cdsPos - codingLength;
456
457                    logger.debug(" <-  Exon        : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
458                    logger.debug("     UTR         : " + format(cdsEnd + 1) + " - " + format(end));
459                    logger.debug( codingLength + " | " + tmp + " | " + format(start+1));
460                    logger.debug(" -> chromPosForward found position in non coding exon:  " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength);
461
462                    return new ChromPos((start +1 + tmp),cdsPos%3);
463                }
464                codingLength += (cdsEnd - start-1);
465
466                logger.debug(" <-  Exon        : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
467                logger.debug("     UTR         : " + format(cdsEnd + 1) + " - " + format(end));
468
469
470            } else if (inCoding) {
471                // A standard coding Exon
472                // tests for the maximum length of this coding exon
473                if (codingLength + (end - start -1)  >= cdsPos) {
474
475                    // we are within the range of this exon
476                    int tmp = cdsPos - codingLength ;
477
478                    logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength);
479                    logger.debug(" -> found chr position in coding exon #" + (i+1) + ":  cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength);
480
481                    return new ChromPos((start +1 + tmp),cdsPos%3);
482                }
483                // full exon is coding
484                codingLength += (end - start );
485
486                logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
487            }
488        }
489        return new ChromPos(-1,-1);
490    }
491
492    /**
493     * Get the length of the coding sequence
494     *
495     * @param exonStarts
496     * @param exonEnds
497     * @param cdsStart
498     * @param cdsEnd
499     * @return
500     */
501    public static int getCDSLengthReverse(List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) {
502
503        int codingLength = 0;
504
505        if (cdsEnd < cdsStart) {
506            int tmp = cdsEnd;
507            cdsEnd = cdsStart;
508            cdsStart = tmp;
509        }
510        cdsStart = cdsStart + base;
511
512        // map reverse
513        for (int i = exonStarts.size() - 1; i >= 0; i--) {
514
515            int end = exonStarts.get(i);
516            int start = exonEnds.get(i);
517
518            if (end < start) {
519                int tmp = end;
520                end = start;
521                start = tmp;
522            }
523            start = start + base;
524
525            if ((start < cdsStart && end < cdsStart) || (start > cdsEnd && end > cdsEnd))
526                continue;
527
528            if (start < cdsStart)
529                start = cdsStart;
530
531            if (end > cdsEnd)
532                end = cdsEnd;
533
534            codingLength += (end - start + 1);
535        }
536        return codingLength - 3;
537    }
538
539    /**
540     * Get the length of the coding sequence
541     *
542     * @param exonStarts
543     * @param exonEnds
544     * @param cdsStart
545     * @param cdsEnd
546     * @return
547     */
548    public static int getCDSLengthForward(List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd) {
549
550        int codingLength = 0;
551
552        for (int i = 0; i < exonStarts.size(); i++) {
553
554            int start = exonStarts.get(i)+base;
555            int end = exonEnds.get(i);
556
557            if ( (start < cdsStart+base && end < cdsStart) || (start > cdsEnd && end > cdsEnd) )
558                continue;
559
560            if (start < cdsStart+base)
561                start = cdsStart+base;
562
563            if (end > cdsEnd)
564                end = cdsEnd;
565
566            codingLength += (end - start + 1);
567        }
568        return codingLength - 3;
569    }
570
571    /** 
572     * Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions)
573     *
574     * @param chromPos
575     * @return
576     */
577    public static List<Range<Integer>> getCDSExonRanges(GeneChromosomePosition chromPos){
578        if ( chromPos.getOrientation() == '+')
579            return getCDSExonRangesForward(chromPos,CDS);
580        return getCDSExonRangesReverse(chromPos,CDS);
581    }
582
583    /** Extracts the boundaries of the coding regions in chromosomal coordinates
584     *
585     * @param chromPos
586     * @return
587     */
588    public static List<Range<Integer>> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){
589        if ( chromPos.getOrientation() == '+')
590            return getCDSExonRangesForward(chromPos,CHROMOSOME);
591        return getCDSExonRangesReverse(chromPos,CHROMOSOME);
592    }
593
594    private static List<Range<Integer>> getCDSExonRangesReverse(GeneChromosomePosition chromPos, String responseType) {
595
596        List<Integer> exonStarts = chromPos.getExonStarts();
597        List<Integer> exonEnds   = chromPos.getExonEnds();
598
599        List<Range<Integer>> data = new ArrayList<>();
600        int cdsStart = chromPos.getCdsStart();
601        int cdsEnd   = chromPos.getCdsEnd();
602
603        boolean inCoding = false;
604        int codingLength = 0;
605
606        if (cdsEnd < cdsStart) {
607            int tmp = cdsEnd;
608            cdsEnd = cdsStart;
609            cdsStart = tmp;
610        }
611
612        java.lang.StringBuffer s =null;
613
614        boolean debug = logger.isDebugEnabled();
615
616        if ( debug)
617            s = new StringBuffer();
618
619                //int lengthExons = 0;
620
621        // map reverse
622        for (int i = exonStarts.size() - 1; i >= 0; i--) {
623
624            int end = exonStarts.get(i);
625            int start = exonEnds.get(i);
626
627            if (end < start) {
628                int tmp = end;
629                end = start;
630                start = tmp;
631            }
632            //lengthExons += end - start;
633            //s.append("Reverse exon: " + end + " - " + start + " | " + (end - start));
634            //s.append(newline);
635
636            if (start <= cdsEnd && end >= cdsEnd) {
637                inCoding = true;
638
639                int tmpstart = start;
640                if (start < cdsStart) {
641                    tmpstart = cdsStart;
642                }
643                codingLength += (cdsEnd - tmpstart);
644                if ( debug ) {
645                    s.append("     UTR         :").append(format(cdsEnd + 1)).append(" | ").append(format(end));
646                    s.append(newline);
647                    if (tmpstart == start)
648                        s.append(" ->  ");
649                    else
650                        s.append(" <-> ");
651                    s.append("Exon        :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3);
652                    s.append(newline);
653                    // single exon with UTR on both ends
654                    if (tmpstart != start)
655                        s.append("     UTR         :").append(format(cdsStart)).append(" - ").append(format(start + 1));
656                    s.append(newline);
657                }
658
659
660                Range<Integer> r ;
661                if ( responseType.equals(CDS))
662                    r = Range.closed(0,codingLength);
663                else
664                    r = Range.closed(tmpstart,cdsEnd);
665
666                data.add(r);
667
668            } else if (start <= cdsStart && end >= cdsStart) {
669                inCoding = false;
670
671                Range<Integer> r;
672                if ( responseType.equals(CDS))
673                    r = Range.closed(codingLength,codingLength+(end-cdsStart));
674                else
675                    r = Range.closed(cdsStart+1,end);
676
677                data.add(r);
678
679                codingLength += (end - cdsStart);
680                if  (debug) {
681                    s.append(" <-  Exon        : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
682                    s.append(newline);
683                    s.append("     UTR         : ").append(format(start + 1)).append(" - ").append(format(cdsStart));
684                    s.append(newline);
685                }
686            } else if (inCoding) {
687                // full exon is coding
688                Range<Integer> r;
689                if ( responseType.equals(CDS))
690                    r = Range.closed(codingLength,codingLength+(end-start));
691                else
692                    r = Range.closed(start,end);
693                data.add(r);
694
695                codingLength += (end - start);
696                if  (debug) {
697                    s.append("     Exon        : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
698                    s.append(newline);
699                }
700            } else {
701                // e.g. see UBQLN3
702                if ( debug ) {
703                    s.append(" no translation! UTR: " + format(start) + " - " + format(end));
704                    s.append(newline);
705                }
706            }
707        }
708        if ( debug ) {
709            s.append("CDS length: ").append(Integer.toString(codingLength - 3));
710            s.append(newline);
711            logger.debug(s.toString());
712        }
713
714        return data;
715    }
716
717    private static List<Range<Integer>> getCDSExonRangesForward(GeneChromosomePosition chromPos, String responseType) {
718
719        List<Range<Integer>> data = new ArrayList<>();
720        List<Integer> exonStarts = chromPos.getExonStarts();
721        List<Integer> exonEnds   = chromPos.getExonEnds();
722
723        int cdsStart = chromPos.getCdsStart();
724        int cdsEnd   = chromPos.getCdsEnd();
725
726        boolean inCoding = false;
727        int codingLength = 0;
728
729        for (int i = 0; i < exonStarts.size(); i++) {
730
731            int start = exonStarts.get(i);
732            int end = exonEnds.get(i);
733
734            if (start <= cdsStart && end >= cdsStart) {
735
736                inCoding = true;
737                codingLength += (end - cdsStart);
738
739                Range<Integer> r;
740                if ( responseType.equals(CDS))
741                    r = Range.closed(0,codingLength);
742                else
743                    r = Range.closed(cdsStart,end);
744                data.add(r);
745
746            } else if (start <= cdsEnd && end >= cdsEnd) {
747                //logger.debug(" <-- CDS end at: " + cdsEnd );
748                inCoding = false;
749
750                Range<Integer> r;
751                if ( responseType.equals(CDS))
752                    r = Range.closed(codingLength,codingLength+(cdsEnd-start));
753                else
754                    r = Range.closed(start,cdsEnd);
755                data.add(r);
756                codingLength += (cdsEnd - start);
757
758            } else if (inCoding) {
759                // full exon is coding
760                Range<Integer> r;
761                if ( responseType.equals(CDS))
762                    r = Range.closed(codingLength,codingLength+(end-start));
763                else
764                    r = Range.closed(start,end);
765                data.add(r);
766                codingLength += (end - start);
767            }
768        }
769        return data;
770    }
771
772    /**
773     * I have a genomic coordinate, where is it on the mRNA
774     *
775     * @param coordinate
776     * @param chromosomePosition
777     * @return
778     */
779    public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) {
780
781        if ( chromosomePosition.getOrientation() == '+')
782                return getCDSPosForward(coordinate,
783                    chromosomePosition.getExonStarts(),
784                    chromosomePosition.getExonEnds(),
785                    chromosomePosition.getCdsStart(),
786                    chromosomePosition.getCdsEnd());
787
788        return getCDSPosReverse(coordinate,
789                chromosomePosition.getExonStarts(),
790                chromosomePosition.getExonEnds(),
791                chromosomePosition.getCdsStart(),
792                chromosomePosition.getCdsEnd());
793    }
794    
795        /** 
796         * Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene 
797         * living on the forward DNA strand.
798         * 
799         * @param chromPos The genetic coordinate on a chromosome 
800     * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)  
801     * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
802     * @param cdsStart The start position of a coding region
803     * @param cdsEnd The end position of a coding region
804     * 
805     * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
806         * 
807         * @author Yana Valasatava
808         */
809    public static int getCDSPosForward(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds,
810            int cdsStart, int cdsEnd) {
811        
812        // the genetic coordinate is not in a coding region
813        if ( (chromPos < (cdsStart+base) ) || ( chromPos > (cdsEnd+base) ) ) {
814                logger.debug("The "+format(chromPos)+" position is not in a coding region");
815            return -1;
816        }
817        
818        logger.debug("looking for CDS position for " +format(chromPos));
819        
820        // map the genetic coordinates of coding region on a stretch of a reverse strand
821        List<Range<Integer>> cdsRegions = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
822        
823        int codingLength = 0;
824        int lengthExon = 0;
825        for (Range<Integer> range : cdsRegions) {
826                
827                    int start = range.lowerEndpoint();
828                    int end = range.upperEndpoint();
829                    
830                    lengthExon = end - start;
831
832                    if (start+base <= chromPos && end >= chromPos ) {
833                        return codingLength + (chromPos-start);
834                    }
835                else { 
836                        codingLength += lengthExon;
837                }
838        }
839        return -1;
840    }
841    
842        /** 
843         * Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene 
844         * living on the reverse DNA strand.
845         * 
846         * @param chromPos The genetic coordinate on a chromosome 
847     * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)  
848     * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
849     * @param cdsStart The start position of a coding region
850     * @param cdsEnd The end position of a coding region
851     * 
852     * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
853         * 
854         * @author Yana Valasatava
855         */
856    public static int getCDSPosReverse(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds,
857            int cdsStart, int cdsEnd) {
858        
859        // the genetic coordinate is not in a coding region
860        if ( (chromPos < (cdsStart+base)) || ( chromPos > (cdsEnd+base) ) ) {
861                logger.debug("The "+format(chromPos)+" position is not in a coding region");
862            return -1;
863        }
864        
865        logger.debug("looking for CDS position for " +format(chromPos));
866                
867        // map the genetic coordinate on a stretch of a reverse strand
868        List<Range<Integer>> cdsRegions = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
869        
870        int codingLength = 0;
871        int lengthExon = 0;
872        for ( int i=cdsRegions.size()-1; i>=0; i-- ) {
873                
874                    int start = cdsRegions.get(i).lowerEndpoint();
875                    int end = cdsRegions.get(i).upperEndpoint();
876                    
877                    lengthExon = end - start;
878                    // +1 offset to be a base 1
879                    if (start+base <= chromPos && end >= chromPos ) {
880                        return codingLength + (end-chromPos+1);
881                    }
882                else { 
883                        codingLength += lengthExon;
884                }
885        }
886        return -1;
887    }
888    
889    /** 
890     * Extracts the exons boundaries in CDS coordinates corresponding to the forward DNA strand.
891     *
892     * @param origExonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
893     * @param origExonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
894     * @param cdsStart The start position of a coding region
895     * @param cdsEnd The end position of a coding region
896     * 
897     * @return the list of genetic positions corresponding to the exons boundaries in CDS coordinates
898     */
899    public static List<Range<Integer>> getCDSRegions(List<Integer> origExonStarts, List<Integer> origExonEnds, int cdsStart, int cdsEnd) {
900        
901        // remove exons that are fully landed in UTRs
902        List<Integer> exonStarts = new ArrayList<Integer>(origExonStarts);
903        List<Integer> exonEnds = new ArrayList<Integer>(origExonEnds);
904        
905        int j=0;
906        for (int i = 0; i < origExonStarts.size(); i++) {
907                if ( ( origExonEnds.get(i) < cdsStart) || ( origExonStarts.get(i) > cdsEnd) ) {
908                        exonStarts.remove(j);
909                        exonEnds.remove(j);
910                }
911                else {
912                        j++;
913                }
914        }
915        
916        // remove untranslated regions from exons
917        int nExons = exonStarts.size();
918        exonStarts.remove(0);
919        exonStarts.add(0, cdsStart);
920        exonEnds.remove(nExons-1);
921        exonEnds.add(cdsEnd);
922        
923        List<Range<Integer>> cdsRegion = new ArrayList<Range<Integer>>();
924        for ( int i=0; i<nExons; i++ ) {
925                Range<Integer> r = Range.closed(exonStarts.get(i), exonEnds.get(i));
926                cdsRegion.add(r);
927        }
928                return cdsRegion;
929    }
930    
931    /** 
932     * Extracts the DNA sequence transcribed from the input genetic coordinates.
933     *
934     * @param twoBitFacade the facade that provide an access to a 2bit file
935     * @param gcp The container with chromosomal positions
936     * 
937     * @return the DNA sequence transcribed from the input genetic coordinates
938     */
939    public static DNASequence getTranscriptDNASequence(TwoBitFacade twoBitFacade, GeneChromosomePosition gcp) throws Exception {
940        return getTranscriptDNASequence(twoBitFacade,gcp.getChromosome(),gcp.getExonStarts(), gcp.getExonEnds(), gcp.getCdsStart(), gcp.getCdsEnd(), gcp.getOrientation());
941    }
942    
943    /** 
944     * Extracts the DNA sequence transcribed from the input genetic coordinates.
945     *
946     * @param chromosome the name of the chromosome
947     * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)  
948     * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
949     * @param cdsStart The start position of a coding region
950     * @param cdsEnd The end position of a coding region
951     * @param orientation The orientation of the strand where the gene is living
952     * 
953     * @return the DNA sequence transcribed from the input genetic coordinates
954     */
955        public static DNASequence getTranscriptDNASequence(TwoBitFacade twoBitFacade, String chromosome, List<Integer> exonStarts, List<Integer> exonEnds, int cdsStart, int cdsEnd, Character orientation) throws Exception {
956
957                List<Range<Integer>> cdsRegion = getCDSRegions(exonStarts, exonEnds, cdsStart, cdsEnd);
958
959                String dnaSequence = "";
960                for (Range<Integer> range : cdsRegion) {
961                        String exonSequence = twoBitFacade.getSequence(chromosome,range.lowerEndpoint(), range.upperEndpoint());
962            dnaSequence += exonSequence;
963                }
964                if (orientation.equals('-')) {
965            dnaSequence = new StringBuilder(dnaSequence).reverse().toString();
966                        DNASequence dna = new DNASequence(dnaSequence);
967                        SequenceView<NucleotideCompound> compliment = dna.getComplement();
968            dnaSequence = compliment.getSequenceAsString();
969                }
970                return new DNASequence(dnaSequence.toUpperCase());
971        }
972}