001package org.biojava.nbio.genome.util;
002
003import com.google.common.collect.Range;
004
005import org.biojava.nbio.genome.parsers.genename.ChromPos;
006import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition;
007import org.slf4j.Logger;
008import org.slf4j.LoggerFactory;
009
010
011import java.io.StringWriter;
012import java.util.ArrayList;
013import java.util.List;
014
015/**
016 *  A class that can map chromosomal positions to mRNA (coding sequence) positions.
017 *
018 *  @author Andreas Prlic
019 */
020
021public class ChromosomeMappingTools {
022
023    private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class);
024
025
026    private static final String newline = System.getProperty("line.separator");
027
028
029    public static final String CHROMOSOME = "CHROMOSOME";
030    public static final String CDS = "CDS";
031
032
033    /** Pretty print the details of a GeneChromosomePosition to a String
034     *
035     * @param chromosomePosition
036     * @return
037     */
038    public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){
039        if ( chromosomePosition.getOrientation() == '+')
040            return formatExonStructureForward(chromosomePosition);
041
042        return formatExonStructureReverse(chromosomePosition);
043
044    }
045
046
047    private static String formatExonStructureForward(GeneChromosomePosition chromPos) {
048
049        StringWriter s = new StringWriter();
050
051        List<Integer> exonStarts = chromPos.getExonStarts();
052        List<Integer> exonEnds   = chromPos.getExonEnds();
053
054        int cdsStart = chromPos.getCdsStart();
055        int cdsEnd   = chromPos.getCdsEnd();
056
057        boolean inCoding = false;
058        int codingLength = 0;
059
060        for (int i = 0; i < exonStarts.size(); i++) {
061
062            int start = exonStarts.get(i);
063            int end = exonEnds.get(i);
064
065            if (start <= cdsStart +1 && end >= cdsStart+1) {
066
067                inCoding = true;
068                codingLength += (end - cdsStart);
069                s.append("     UTR         : ").append(format(start)).append(" - ").append(format(cdsStart));
070                s.append(newline);
071                s.append(" ->  Exon        : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
072                s.append(newline);
073
074            } else if (start <= cdsEnd && end >= cdsEnd) {
075                //logger.debug(" <-- CDS end at: " + cdsEnd );
076                inCoding = false;
077                codingLength += (cdsEnd - start);
078
079                s.append(" <-  Exon        : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
080                s.append(newline);
081                s.append("     UTR         : " + (cdsEnd +1) + " - " + format(end));
082                s.append(newline);
083
084
085            } else if (inCoding) {
086                // full exon is coding
087                codingLength += (end - start);
088
089                s.append("     Exon        : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
090                s.append(newline);
091            }
092
093        }
094        s.append("Coding Length: ");
095        s.append((codingLength-3)+"");
096        s.append(newline);
097        return s.toString();
098    }
099
100
101
102    private static String showGenePosLink(GeneChromosomePosition chromPos, Integer pos ) {
103
104        String spos = format(pos);
105
106        StringBuffer buf = new StringBuffer();
107        buf.append("<a href=\"/pdb/gene/");
108        buf.append(chromPos.getGeneName());
109        buf.append("?chromosome=");
110        buf.append(chromPos.getChromosome());
111        buf.append("&range=");
112        buf.append(pos);
113        buf.append("\">");
114        buf.append(spos);
115        buf.append("</a>");
116
117        return buf.toString();
118    }
119
120
121    private static String formatExonStructureReverse(GeneChromosomePosition chromPos) {
122        StringWriter s = new StringWriter();
123
124        List<Integer> exonStarts = chromPos.getExonStarts();
125        List<Integer> exonEnds   = chromPos.getExonEnds();
126
127
128        int cdsStart = chromPos.getCdsStart();
129        int cdsEnd   = chromPos.getCdsEnd();
130
131        // logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd));
132
133        boolean inCoding = false;
134        int codingLength = 0;
135
136        if (cdsEnd < cdsStart) {
137            int tmp = cdsEnd;
138            cdsEnd = cdsStart;
139            cdsStart = tmp;
140        }
141
142        // map reverse
143        for (int i = exonStarts.size() - 1; i >= 0; i--) {
144
145            int end = exonStarts.get(i);
146            int start = exonEnds.get(i);
147
148            if (end < start) {
149                int tmp = end;
150                end = start;
151                start = tmp;
152            }
153
154            if (start <= cdsEnd && end >= cdsEnd) {
155                inCoding = true;
156
157                int tmpstart = start;
158                if (start < cdsStart) {
159                    tmpstart = cdsStart;
160                }
161                codingLength += (cdsEnd - tmpstart);
162
163                s.append("     UTR         :" + format(cdsEnd + 1) + " | " + format(end));
164                s.append(newline);
165                if (tmpstart == start)
166                    s.append(" ->  ");
167                else
168                    s.append(" <-> ");
169                s.append("Exon        :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3));
170                s.append(newline);
171                // single exon with UTR on both ends
172                if (tmpstart != start)
173                    s.append("     UTR         :" + format(cdsStart ) + " - " + format(start + 1));
174                s.append(newline);
175
176            } else if (start <= cdsStart && end >= cdsStart) {
177                inCoding = false;
178                codingLength += (end - cdsStart);
179
180                s.append(" <-  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
181                s.append(newline);
182                s.append("     UTR         : " + format(start+1) + " - " + format(cdsStart ));
183                s.append(newline);
184
185
186            } else if (inCoding) {
187                // full exon is coding
188                codingLength += (end - start);
189
190                s.append("     Exon        : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
191                s.append(newline);
192            } else {
193                // e.g. see UBQLN3
194                s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end));
195                s.append(newline);
196            }
197        }
198
199        s.append("CDS length: ").append(Integer.toString(codingLength - 3));
200        s.append(newline);
201
202        return s.toString();
203    }
204
205    /**
206     * Get the length of the CDS in nucleotides.
207     *
208     * @param chromPos
209     * @return length of the CDS in nucleotides.
210     */
211    public static int getCDSLength(GeneChromosomePosition chromPos) {
212
213        logger.debug(chromPos.toString());
214
215        logger.debug("chromosomal information: ");
216
217        logger.debug("Gene:" + chromPos.getGeneName());
218        logger.debug("  Transcription (including UTRs): " + chromPos.getTranscriptionStart() + " - " + chromPos.getTranscriptionEnd() + " (length:" + (chromPos.getTranscriptionEnd() - chromPos.getTranscriptionStart()) + ")");
219        logger.debug("  Orientation: " + chromPos.getOrientation());
220        logger.debug("  CDS: " + (chromPos.getCdsStart()) + " - " + chromPos.getCdsEnd() + " (length: " + (chromPos.getCdsEnd() - chromPos.getCdsStart()) + ")");
221
222
223        List<Integer> exonStarts = chromPos.getExonStarts();
224        List<Integer> exonEnds = chromPos.getExonEnds();
225
226        logger.debug("Exons:" + exonStarts.size());
227
228        int cdsStart = chromPos.getCdsStart();
229        int cdsEnd = chromPos.getCdsEnd();
230
231
232        int codingLength;
233        if (chromPos.getOrientation().equals('+'))
234            codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd);
235        else
236            codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd);
237        return codingLength;
238    }
239
240
241    /**
242     * maps the position of a CDS nucleotide back to the genome
243     *
244     * @param cdsNucleotidePosition
245     * @return a ChromPos object
246     */
247    public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) {
248
249        logger.debug(" ? Checking chromosome position for CDS position " + cdsNucleotidePosition);
250
251        List<Integer> exonStarts = chromPos.getExonStarts();
252        List<Integer> exonEnds = chromPos.getExonEnds();
253
254        logger.debug(" Exons:" + exonStarts.size());
255
256        int cdsStart = chromPos.getCdsStart();
257        int cdsEnd = chromPos.getCdsEnd();
258
259
260        ChromPos chromosomePos = null;
261
262        if (chromPos.getOrientation().equals('+'))
263
264            chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
265        else
266            chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd);
267
268        logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at  " + chromosomePos);
269        return chromosomePos;
270
271    }
272
273    /** returns a nicely formatted representation of the position
274     *
275     * @param chromosomePosition
276     * @return
277     */
278    private static String format(int chromosomePosition){
279
280
281        return String.format("%,d", chromosomePosition);
282    }
283
284    /**
285     * Get the CDS position mapped on the chromosome position
286     *
287     * @param exonStarts
288     * @param exonEnds
289     * @param cdsStart
290     * @param cdsEnd
291     * @return
292     */
293    public static ChromPos getChromPosReverse(int cdsPos, List<Integer> exonStarts,
294                                              List<Integer> exonEnds, int cdsStart, int cdsEnd) {
295
296        boolean inCoding = false;
297        int codingLength = 0;
298
299        if (cdsEnd < cdsStart) {
300            int tmp = cdsEnd;
301            cdsEnd = cdsStart;
302            cdsStart = tmp;
303        }
304
305        int lengthExons = 0;
306
307        // map reverse
308        for (int i = exonStarts.size() - 1; i >= 0; i--) {
309
310            logger.debug("Exon #" + (i+1) + "/" + exonStarts.size());
311            int end = exonStarts.get(i);
312            int start = exonEnds.get(i);
313
314            if (end < start) {
315                int tmp = end;
316                end = start;
317                start = tmp;
318            }
319            lengthExons += end - start;
320
321            logger.debug("     is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1));
322            logger.debug("     CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength);
323
324            if (start+1 <= cdsEnd && end >= cdsEnd) {
325
326
327                // FIRST EXON
328                inCoding = true;
329
330
331                int tmpstart = start;
332                if (start < cdsStart) {
333                    tmpstart = cdsStart;
334                }
335
336                // here one of the few places where we don't say start+1
337                int check = codingLength + cdsEnd - tmpstart ;
338
339                logger.debug("First Exon    | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos );
340
341
342                if ( ( check > cdsPos)  )
343                {
344
345                    int tmp = cdsPos - codingLength ;
346
347
348                    logger.debug(" -> found position in UTR exon:  " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength);
349
350                    return new ChromPos((cdsEnd - tmp), -1) ;
351                }
352
353
354                // don't add 1 here
355                codingLength += (cdsEnd - tmpstart );
356
357                boolean debug = logger.isDebugEnabled();
358
359
360                if ( debug ) {
361
362                    StringBuffer b = new StringBuffer();
363
364                    b.append("     UTR         :" + format(cdsEnd + 1) + " - " + format(end) + newline);
365                    if (tmpstart == start)
366                        b.append(" ->  ");
367                    else
368                        b.append(" <-> ");
369                    b.append("Exon        :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline);
370
371                    // single exon with UTR on both ends
372                    if (tmpstart != start)
373                        b.append("     UTR         :" + format(cdsStart) + " - " + format(start + 1) + newline);
374
375                    logger.debug(b.toString());
376                }
377            } else if (start <= cdsStart && end >= cdsStart) {
378
379                // LAST EXON
380                inCoding = false;
381
382                if (codingLength + end - cdsStart >= cdsPos) {
383
384                    // how many remaining coding nucleotides?
385                    int tmp = codingLength + end - cdsStart - cdsPos ;
386
387                    logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart));
388
389                    logger.debug(" -> XXX found position noncoding exon:  cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd));
390
391                    return new ChromPos((cdsStart + tmp),-1);
392                }
393
394                codingLength += (end - cdsStart);
395
396                logger.debug(" <-  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));
397                logger.debug("     UTR         : " + format(start+1) + " - " + format(cdsStart ));
398
399            } else if (inCoding) {
400
401
402                if (codingLength + end - start -1  >= cdsPos) {
403
404                    int tmp = cdsPos - codingLength ;
405
406                    if ( tmp > (end - start ) ) {
407
408                        tmp = (end - start );
409
410                        logger.debug("changing tmp to " + tmp);
411
412                    }
413
414                    logger.debug("     " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp);
415                    logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
416                    logger.debug(" ->  RRR found position coding exon:  " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength);
417
418                    return new ChromPos((end - tmp),cdsPos %3);
419                }
420
421                // full exon is coding
422                codingLength += (end - start) ;
423
424                logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
425            } else {
426                // e.g. see UBQLN3
427
428                logger.debug(" no translation!");
429            }
430
431
432            logger.debug("     coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos);
433
434
435        }
436
437        logger.debug("length exons: " + lengthExons);
438        // could not map, or map over the full length??
439        return new ChromPos(-1,-1);
440
441
442    }
443
444    /**
445     * Get the CDS position mapped onto the chromosome position
446     *
447     * @param exonStarts
448     * @param exonEnds
449     * @param cdsStart
450     * @param cdsEnd
451     * @return
452     */
453    public static ChromPos getChromPosForward(int cdsPos, List<Integer> exonStarts, List<Integer> exonEnds,
454                                              int cdsStart, int cdsEnd) {
455        boolean inCoding = false;
456        int codingLength = 0;
457
458        int lengthExons = 0;
459        // map forward
460        for (int i = 0; i < exonStarts.size(); i++) {
461
462
463            // start can include UTR
464            int start = exonStarts.get(i);
465            int end = exonEnds.get(i);
466
467            lengthExons += end - start;
468
469
470            if (start <= cdsStart +1 && end >= cdsStart+1) {
471                // first exon with UTR
472                if (codingLength + (end - cdsStart-1) >= cdsPos) {
473                    // we are reaching our target position
474                    int tmp = cdsPos - codingLength;
475
476
477                    logger.debug(cdsStart + " | " + codingLength + " | " + tmp);
478                    logger.debug(" -> found position in UTR exon:  #"+(i+1)+ " cdsPos:" + cdsPos +
479                            " return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength);
480
481                    // we start 1 after cdsStart...
482                    return new ChromPos((cdsStart +1 + tmp),-1);
483                }
484                inCoding = true;
485                codingLength += (end - cdsStart);
486
487                logger.debug("     UTR         : " + format(start+1) + " - " + (cdsStart ));
488                logger.debug(" ->  Exon        : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
489
490            } else if (start+1 <= cdsEnd && end >= cdsEnd) {
491                // LAST EXON with UTR
492                //logger.debug(" <-- CDS end at: " + cdsEnd );
493                inCoding = false;
494                if (codingLength + (cdsEnd - start-1) >= cdsPos) {
495                    int tmp = cdsPos - codingLength;
496
497
498                    logger.debug(" <-  Exon        : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
499                    logger.debug("     UTR         : " + format(cdsEnd + 1) + " - " + format(end));
500                    logger.debug( codingLength + " | " + tmp + " | " + format(start+1));
501                    logger.debug(" -> chromPosForward found position in non coding exon:  " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength);
502
503                    return new ChromPos((start +1 + tmp),cdsPos%3);
504                }
505                codingLength += (cdsEnd - start-1);
506
507                logger.debug(" <-  Exon        : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3));
508                logger.debug("     UTR         : " + format(cdsEnd + 1) + " - " + format(end));
509
510
511            } else if (inCoding) {
512                // A standard coding Exon
513                // tests for the maximum length of this coding exon
514                if (codingLength + (end - start -1)  >= cdsPos) {
515
516                    // we are within the range of this exon
517                    int tmp = cdsPos - codingLength ;
518
519
520                    logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength);
521                    logger.debug(" -> found chr position in coding exon #" + (i+1) + ":  cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength);
522
523                    return new ChromPos((start +1 + tmp),cdsPos%3);
524                }
525                // full exon is coding
526                codingLength += (end - start );
527
528                logger.debug("     Exon        : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3));
529            }
530            //                  if ( inCoding )
531            //                          logger.debug("exon phase at end:" + (codingLength % 3));
532            //
533            //                  logger.debug("   coding length: " + codingLength);
534
535
536        }
537
538        //logger.debug("length exons: " + lengthExons);
539        //return codingLength - 3;
540
541        // could not map!
542
543        return new ChromPos(-1,-1);
544    }
545
546
547    /**
548     * Get the length of the coding sequence
549     *
550     * @param exonStarts
551     * @param exonEnds
552     * @param cdsStart
553     * @param cdsEnd
554     * @return
555     */
556    public static int getCDSLengthReverse(List<Integer> exonStarts,
557                                          List<Integer> exonEnds, int cdsStart, int cdsEnd) {
558
559        boolean inCoding = false;
560        int codingLength = 0;
561
562        if (cdsEnd < cdsStart) {
563            int tmp = cdsEnd;
564            cdsEnd = cdsStart;
565            cdsStart = tmp;
566        }
567
568        int lengthExons = 0;
569
570        // map reverse
571        for (int i = exonStarts.size() - 1; i >= 0; i--) {
572
573            int end = exonStarts.get(i);
574            int start = exonEnds.get(i);
575
576            if (end < start) {
577                int tmp = end;
578                end = start;
579                start = tmp;
580            }
581            lengthExons += end - start;
582
583            if (start <= cdsEnd && end >= cdsEnd) {
584                inCoding = true;
585
586
587                int tmpstart = start;
588                if (start < cdsStart) {
589                    tmpstart = cdsStart;
590                }
591                codingLength += (cdsEnd - tmpstart);
592
593                boolean debug = logger.isDebugEnabled();
594
595                if ( debug) {
596
597                    StringBuffer b = new StringBuffer();
598
599                    b.append("     UTR         :" + (cdsEnd + 1) + " - " + (end) + newline);
600                    if (tmpstart == start)
601                        b.append(" ->  ");
602                    else
603                        b.append(" <-> ");
604                    b.append("Exon        :" + tmpstart + " - " + cdsEnd + " | " + (cdsEnd - tmpstart) + " | " + codingLength + " | " + (codingLength % 3) + newline);
605                    // single exon with UTR on both ends
606                    if (tmpstart != start)
607                        b.append("     UTR         :" + (cdsStart - 1) + " - " + start  + newline);
608                    logger.debug(b.toString());
609
610                }
611            } else if (start <= cdsStart && end >= cdsStart) {
612                inCoding = false;
613                codingLength += (end - cdsStart);
614
615
616                logger.debug(" <-  Exon        : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart) + " | " + (codingLength-3)  + " | " + (codingLength % 3));
617                logger.debug("     UTR         : " + start + " - " + (cdsStart ));
618
619
620
621            } else if (inCoding) {
622                // full exon is coding
623                codingLength += (end - start);
624
625                logger.debug("     Exon        : " + start + " - " + end + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
626            } else {
627                // e.g. see UBQLN3
628
629                logger.debug(" no translation!");
630            }
631
632        }
633
634        logger.debug("length exons: " + lengthExons + " codin length: " + (codingLength - 3));
635        return codingLength - 3;
636    }
637
638    /**
639     * Get the length of the coding sequence
640     *
641     * @param exonStarts
642     * @param exonEnds
643     * @param cdsStart
644     * @param cdsEnd
645     * @return
646     */
647    public static int getCDSLengthForward(List<Integer> exonStarts, List<Integer> exonEnds,
648                                          int cdsStart, int cdsEnd) {
649        boolean inCoding = false;
650        int codingLength = 0;
651
652        int lengthExons = 0;
653        // map forward
654        for (int i = 0; i < exonStarts.size(); i++) {
655
656            int start = exonStarts.get(i);
657            int end = exonEnds.get(i);
658            lengthExons += end - start;
659
660
661            logger.debug("forward exon: " + (start+1) + " - " + end + " | " + (end - start));
662
663            if (start+1 <= cdsStart +1 && end >= cdsStart+1) {
664
665                inCoding = true;
666                codingLength += (end - cdsStart);
667
668                logger.debug("     UTR         : " + start + " - " + (cdsStart ));
669                logger.debug(" ->  Exon        : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3));
670
671            } else if (start+1 <= cdsEnd && end >= cdsEnd) {
672
673                inCoding = false;
674                codingLength += (cdsEnd - start);
675
676                logger.debug(" <-  Exon        : " + (start +1)+ " - " + cdsEnd + " | " + (cdsEnd - start+1) + " | " + codingLength + " | " + (codingLength % 3));
677                logger.debug("     UTR         : " + cdsEnd + 1 + " - " + end);
678
679
680            } else if (inCoding) {
681                // full exon is coding
682                codingLength += (end - start);
683
684                logger.debug("     Exon        :" + (start+1) + " - " + end + " | " + (end - start+1) + " | " + codingLength + " | " + (codingLength % 3));
685            }
686
687
688        }
689
690        logger.debug("length exons: " + Integer.toString(lengthExons));
691        logger.debug("CDS length:" + Integer.toString((codingLength-3)));
692
693        return codingLength-3 ;
694    }
695
696
697
698    /** Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions)
699     *
700     * @param chromPos
701     * @return
702     */
703    public static List<Range<Integer>> getCDSExonRanges(GeneChromosomePosition chromPos){
704        if ( chromPos.getOrientation() == '+')
705
706            return getCDSExonRangesForward(chromPos,CDS);
707
708        return getCDSExonRangesReverse(chromPos,CDS);
709    }
710
711
712    /** Extracts the boundaries of the coding regions in chromosomal coordinates
713     *
714     * @param chromPos
715     * @return
716     */
717    public static List<Range<Integer>> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){
718        if ( chromPos.getOrientation() == '+')
719            return getCDSExonRangesForward(chromPos,CHROMOSOME);
720
721        return getCDSExonRangesReverse(chromPos,CHROMOSOME);
722    }
723
724    private static List<Range<Integer>> getCDSExonRangesReverse(GeneChromosomePosition chromPos,
725                                                                String responseType) {
726
727        List<Integer> exonStarts = chromPos.getExonStarts();
728        List<Integer> exonEnds   = chromPos.getExonEnds();
729
730        List<Range<Integer>> data = new ArrayList<>();
731        int cdsStart = chromPos.getCdsStart();
732        int cdsEnd   = chromPos.getCdsEnd();
733
734        boolean inCoding = false;
735        int codingLength = 0;
736
737        if (cdsEnd < cdsStart) {
738            int tmp = cdsEnd;
739            cdsEnd = cdsStart;
740            cdsStart = tmp;
741        }
742
743        java.lang.StringBuffer s =null;
744
745        boolean debug = logger.isDebugEnabled();
746
747
748        if ( debug)
749            s = new StringBuffer();
750
751        int lengthExons = 0;
752
753        // map reverse
754        for (int i = exonStarts.size() - 1; i >= 0; i--) {
755
756            int end = exonStarts.get(i);
757            int start = exonEnds.get(i);
758
759            if (end < start) {
760                int tmp = end;
761                end = start;
762                start = tmp;
763            }
764            lengthExons += end - start;
765            //s.append("Reverse exon: " + end + " - " + start + " | " + (end - start));
766            //s.append(newline);
767
768            if (start <= cdsEnd && end >= cdsEnd) {
769                inCoding = true;
770
771
772                int tmpstart = start;
773                if (start < cdsStart) {
774                    tmpstart = cdsStart;
775                }
776                codingLength += (cdsEnd - tmpstart);
777                if ( debug ) {
778                    s.append("     UTR         :").append(format(cdsEnd + 1)).append(" | ").append(format(end));
779                    s.append(newline);
780                    if (tmpstart == start)
781                        s.append(" ->  ");
782                    else
783                        s.append(" <-> ");
784                    s.append("Exon        :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3);
785                    s.append(newline);
786                    // single exon with UTR on both ends
787                    if (tmpstart != start)
788                        s.append("     UTR         :").append(format(cdsStart)).append(" - ").append(format(start + 1));
789                    s.append(newline);
790                }
791
792
793                Range r ;
794                if ( responseType.equals(CDS))
795                    r = Range.closed(0,codingLength);
796                else
797                    r = Range.closed(tmpstart,cdsEnd);
798
799                data.add(r);
800
801            } else if (start <= cdsStart && end >= cdsStart) {
802                inCoding = false;
803
804                Range r;
805                if ( responseType.equals(CDS))
806                    r = Range.closed(codingLength,codingLength+(end-cdsStart));
807                else
808                    r = Range.closed(cdsStart+1,end);
809
810                data.add(r);
811
812                codingLength += (end - cdsStart);
813                if  (debug) {
814                    s.append(" <-  Exon        : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3));
815                    s.append(newline);
816                    s.append("     UTR         : ").append(format(start + 1)).append(" - ").append(format(cdsStart));
817                    s.append(newline);
818                }
819
820
821            } else if (inCoding) {
822                // full exon is coding
823
824                Range r;
825                if ( responseType.equals(CDS))
826                    r = Range.closed(codingLength,codingLength+(end-start));
827                else
828                    r = Range.closed(start,end);
829                data.add(r);
830
831                codingLength += (end - start);
832                if  (debug) {
833                    s.append("     Exon        : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3));
834                    s.append(newline);
835                }
836            } else {
837                // e.g. see UBQLN3
838                if ( debug ) {
839                    s.append(" no translation! UTR: " + format(start) + " - " + format(end));
840                    s.append(newline);
841                }
842            }
843        }
844        if ( debug ) {
845            s.append("CDS length: ").append(Integer.toString(codingLength - 3));
846            s.append(newline);
847            logger.debug(s.toString());
848        }
849
850        return data;
851    }
852
853
854
855    private static List<Range<Integer>> getCDSExonRangesForward(GeneChromosomePosition chromPos,
856                                                                String responseType) {
857
858        List<Range<Integer>> data = new ArrayList<>();
859        List<Integer> exonStarts = chromPos.getExonStarts();
860        List<Integer> exonEnds   = chromPos.getExonEnds();
861
862
863        int cdsStart = chromPos.getCdsStart();
864        int cdsEnd   = chromPos.getCdsEnd();
865
866        boolean inCoding = false;
867        int codingLength = 0;
868
869        for (int i = 0; i < exonStarts.size(); i++) {
870
871            int start = exonStarts.get(i);
872            int end = exonEnds.get(i);
873
874            if (start <= cdsStart && end >= cdsStart) {
875
876                inCoding = true;
877                codingLength += (end - cdsStart);
878//
879
880
881                Range r;
882                if ( responseType.equals(CDS))
883                    r = Range.closed(0,codingLength);
884                else
885                    r = Range.closed(cdsStart,end);
886                data.add(r);
887
888            } else if (start <= cdsEnd && end >= cdsEnd) {
889                //logger.debug(" <-- CDS end at: " + cdsEnd );
890                inCoding = false;
891
892
893                Range r;
894                if ( responseType.equals(CDS))
895                    r = Range.closed(codingLength,codingLength+(cdsEnd-start));
896                else
897                    r = Range.closed(start,cdsEnd);
898                data.add(r);
899                codingLength += (cdsEnd - start);
900
901            } else if (inCoding) {
902                // full exon is coding
903
904                Range r;
905                if ( responseType.equals(CDS))
906                    r = Range.closed(codingLength,codingLength+(end-start));
907                else
908                    r = Range.closed(start,end);
909                data.add(r);
910                codingLength += (end - start);
911
912            }
913
914        }
915
916        return data;
917    }
918//
919
920    /**
921     * I have a genomic coordinate, where is it on the mRNA
922     *
923     * @param coordinate
924     * @param chromosomePosition
925     * @return
926     */
927    public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) {
928
929        if ( chromosomePosition.getOrientation() == '+')
930                return getCDSPosForward(coordinate,
931                    chromosomePosition.getExonStarts(),
932                    chromosomePosition.getExonEnds(),
933                    chromosomePosition.getCdsStart(),
934                    chromosomePosition.getCdsEnd());
935
936        return getCDSPosReverse(coordinate,
937                chromosomePosition.getExonStarts(),
938                chromosomePosition.getExonEnds(),
939                chromosomePosition.getCdsStart(),
940                chromosomePosition.getCdsEnd());
941    }
942
943        /** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
944         * living on the forward DNA strand.
945         *
946         * @param chromPos The genetic coordinate on a chromosome
947     * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
948     * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
949     * @param cdsStart The start position of a coding region
950     * @param cdsEnd The end position of a coding region
951     *
952     * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
953         *
954         * @author Yana Valasatava
955         */
956    public static int getCDSPosForward(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds,
957            int cdsStart, int cdsEnd) {
958
959        // the genetic coordinate is not in a coding region
960        if ( (chromPos < (cdsStart+1) ) || ( chromPos > (cdsEnd+1) ) ) {
961                logger.debug("The "+format(chromPos)+" position is not in a coding region");
962            return -1;
963        }
964
965        logger.debug("looking for CDS position for " +format(chromPos));
966
967        // remove exons that are fully landed in UTRs
968        List<Integer> tmpS = new ArrayList<Integer>(exonStarts);
969        List<Integer> tmpE = new ArrayList<Integer>(exonEnds);
970
971        int j=0;
972        for (int i = 0; i < tmpS.size(); i++) {
973                if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) {
974                        exonStarts.remove(j);
975                        exonEnds.remove(j);
976                }
977                else {
978                        j++;
979                }
980        }
981
982        // remove untranslated regions from exons
983        int nExons = exonStarts.size();
984        exonStarts.remove(0);
985        exonStarts.add(0, cdsStart);
986        exonEnds.remove(nExons-1);
987        exonEnds.add(cdsEnd);
988
989        int codingLength = 0;
990        int lengthExon = 0;
991
992        // map the genetic coordinate on a stretch of a reverse strand
993        for (int i = 0; i < nExons; i++) {
994
995                    int start = exonStarts.get(i);
996                    int end = exonEnds.get(i);
997
998                    lengthExon = end - start;
999
1000                    if (start+1 <= chromPos && end >= chromPos ) {
1001                        return codingLength + (chromPos-start);
1002                    }
1003                else {
1004                        codingLength += lengthExon;
1005                }
1006        }
1007        return -1;
1008    }
1009
1010        /** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene
1011         * living on the reverse DNA strand.
1012         *
1013         * @param chromPos The genetic coordinate on a chromosome
1014     * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
1015     * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
1016     * @param cdsStart The start position of a coding region
1017     * @param cdsEnd The end position of a coding region
1018     *
1019     * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1)
1020         *
1021         * @author Yana Valasatava
1022         */
1023    public static int getCDSPosReverse(int chromPos, List<Integer> exonStarts, List<Integer> exonEnds,
1024            int cdsStart, int cdsEnd) {
1025
1026        // the genetic coordinate is not in a coding region
1027        if ( (chromPos < (cdsStart+1)) || ( chromPos > (cdsEnd+1) ) ) {
1028                logger.debug("The "+format(chromPos)+" position is not in a coding region");
1029            return -1;
1030        }
1031
1032        logger.debug("looking for CDS position for " +format(chromPos));
1033
1034        // remove exons that are fully landed in UTRs
1035        List<Integer> tmpS = new ArrayList<Integer>(exonStarts);
1036        List<Integer> tmpE = new ArrayList<Integer>(exonEnds);
1037
1038        int j=0;
1039        for (int i = tmpS.size() - 1; i >= 0; i--) {
1040                if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) {
1041                        exonStarts.remove(j);
1042                        exonEnds.remove(j);
1043                }
1044                else {
1045                        j++;
1046                }
1047        }
1048
1049        // remove untranslated regions from exons
1050        int nExons = exonStarts.size();
1051        exonStarts.remove(0);
1052        exonStarts.add(0, cdsStart);
1053        exonEnds.remove(nExons-1);
1054        exonEnds.add(cdsEnd);
1055
1056        int codingLength = 0;
1057        int lengthExon = 0;
1058        
1059        // map the genetic coordinate on a stretch of a reverse strand
1060        for (int i = nExons - 1; i >= 0; i--) {
1061
1062                    int start = exonStarts.get(i);
1063                    int end = exonEnds.get(i);
1064
1065                    lengthExon = end - start;
1066                    // +1 offset to be a base 1
1067                    if (start+1 <= chromPos && end >= chromPos ) {
1068                        return codingLength + (end-chromPos+1);
1069                    }
1070                else {
1071                        codingLength += lengthExon;
1072                }
1073        }
1074        return -1;
1075    }
1076}