Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.phylo;
022
/**
 * This class provides static methods for the calculation of the percentage of
 * identity between two aligned sequences.
 * <p>
 * Since 4.1.1 the methods for distance inference in forester are also used in
 * BioJava, so this implementation of percentage of identity is not needed
 * anymore. However, the code is maintained as BioJava's own implementation.
 *
 * @author Scooter Willis
 *
 */
public class Comparison {

        /** Distance between a lower case ASCII letter and its upper case form. */
        private static final int CASE_SHIFT = 'a' - 'A';

        /**
         * Calculates the gapped percentage of identity (PID) of two aligned
         * sequences over the whole length of the first sequence (clamped to the
         * shorter of the two sequences).
         * <p>
         * The comparison ignores the case of ASCII letters. A column in which
         * either sequence has a gap (see {@link #isGap(char)}) is not counted
         * as a mismatch.
         *
         * @param seq1
         *            first aligned sequence, must not be null
         * @param seq2
         *            second aligned sequence, must not be null
         * @return the percentage of identity in the range [0, 100], or 0 if
         *         there is no column to compare
         */
        public static final float PID(String seq1, String seq2) {
                return PID(seq1, seq2, 0, seq1.length());
        }

        /**
         * Calculates the gapped percentage of identity (PID) of two aligned
         * sequences, restricted to the columns {@code start} (inclusive) to
         * {@code end} (exclusive).
         * <p>
         * The region is adjusted to the sequences instead of failing:
         * {@code end} is clamped to the length of the shorter sequence, a
         * {@code start} beyond the end of the region is moved to the last
         * column of the region (a single column is compared), and a negative
         * {@code start} is treated as 0.
         * <p>
         * The comparison ignores the case of ASCII letters. A column in which
         * either sequence has a gap (see {@link #isGap(char)}) is not counted
         * as a mismatch.
         *
         * @param seq1
         *            first aligned sequence, must not be null
         * @param seq2
         *            second aligned sequence, must not be null
         * @param start
         *            index of the first column to compare (inclusive)
         * @param end
         *            index after the last column to compare (exclusive)
         * @return the percentage of identity of the compared region in the
         *         range [0, 100], or 0 if the region contains no column
         */
        public static final float PID(String seq1, String seq2, int start, int end) {

                int len = Math.min(seq1.length(), seq2.length());

                if (end < len) {
                        len = end;
                }

                if (len < start) {
                        start = len - 1; // we just use a single residue for the difference
                }

                // len - 1 is negative for an empty overlap, and callers may pass a
                // negative start; never index before the first column.
                if (start < 0) {
                        start = 0;
                }

                // Only the columns inside the region count. Dividing by len would
                // treat every column before start as an identical one.
                int regionLength = len - start;
                if (regionLength < 1) {
                        // Nothing to compare; avoid the 0/0 division that yields NaN.
                        return 0f;
                }

                int bad = 0;

                for (int i = start; i < len; i++) {

                        char chr1 = toUpperAscii(seq1.charAt(i));
                        char chr2 = toUpperAscii(seq2.charAt(i));

                        if (chr1 != chr2 && !isGap(chr1) && !isGap(chr2)) {
                                bad++;
                        }
                }

                return (100f * (regionLength - bad)) / regionLength;
        }

        /**
         * Converts a lower case ASCII letter to upper case and leaves any other
         * character untouched. Used instead of Character.toUpperCase because
         * only the ASCII range is relevant here and this is cheaper.
         */
        private static char toUpperAscii(char c) {
                if ('a' <= c && c <= 'z') {
                        return (char) (c - CASE_SHIFT);
                }
                return c;
        }

        /**
         * Method that determines if a character means a gap in the alignment.
         *
         * @param c
         *            the character to test; a gap character is one of the
         *            symbols in {' ','-','.'}
         *
         * @return true if it is a gap, false otherwise
         */
        public static final boolean isGap(char c) {
                return c == '-' || c == '.' || c == ' ';
        }

}