001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.homologene;
023
024import java.io.BufferedReader;
025import java.io.FileNotFoundException;
026import java.io.FileReader;
027import java.io.IOException;
028import java.net.URL;
029import java.util.Iterator;
030import java.util.regex.Matcher;
031import java.util.regex.Pattern;
032
033import javax.naming.OperationNotSupportedException;
034
035/**
036 * Homologene is a NCBI dataset that curates sets
037 * of orthologues from the reference model ogranisms.
038 * <p>
039 * This class is a Collection of methods for handling
040 * data from the Homologene dataset.
041 *
042 * @author David Huen
043 */
044public class HomologeneTools
045{
046    /**
047     * get the Taxon corresponding to this Taxon ID
048     */
049    public static Taxon getTaxon(int taxonID)
050    {
051        // currently just does a linear search
052        for (Iterator taxaI = Taxon.taxa.iterator(); taxaI.hasNext(); ) {
053            Taxon curr = (Taxon) taxaI.next();
054
055            if (curr.getTaxonID() == taxonID) return curr;
056        }
057
058        return null;
059    }
060
061    /**
062     * add a Taxon
063     */
064    public static Taxon createTaxon(int taxonID, String description)
065        throws DuplicateTaxonException
066    {
067        // first check that the taxon des not exist
068        if (getTaxon(taxonID) != null) throw new DuplicateTaxonException();
069
070        Taxon newTaxon = new Taxon.TaxonStub(taxonID, description);
071
072        return newTaxon;
073    }
074
075    /**
076     * instantiate a HomologeneDB.
077     * <p>
078     * Currently, only file protocol support is available.
079     */
080    public static void instantiateDB(URL url, HomologeneBuilder builder)
081        throws OperationNotSupportedException, FileNotFoundException, IOException
082    {
083        boolean inDB = false;
084        boolean inGroup = false;
085
086
087        if (!url.getProtocol().equals("file"))
088            throw new OperationNotSupportedException();
089
090        // open the file
091        BufferedReader rdr = new BufferedReader(
092            new FileReader(url.getPath())
093            );
094
095        // the file may or may not have a ">" at the start
096
097        // read loop
098        Pattern titlePattern = Pattern.compile("TITLE\\s(\\d+)_(\\d+)=(\\S+)\\s(.*)");
099        Pattern orthoPattern = Pattern.compile("^(\\d+)\\s*\\|\\s*(\\d+)\\s*\\|([Bbc]{1})\\|(.*)\\|\\s*(\\d+)\\s*\\|(.*)\\|(.*)\\|\\s*(\\d+)\\s*\\|(.*)\\|(.*)");
100        String currLine;
101        while ((currLine = rdr.readLine()) != null) {
102
103            // parse current line
104            if (currLine.startsWith(">")) {
105                // start new group
106                if (!inDB) {
107                    builder.startDB();
108                    inDB = true;
109                }
110                if (inGroup) {
111                    builder.endGroup();
112                }
113                builder.startGroup(); inGroup = true;
114            }
115            else if (currLine.startsWith("TITLE")) {
116                try {
117                    // parse the line
118                    Matcher m = titlePattern.matcher(currLine);
119
120                    if (m.matches()) {
121                        if (m.groupCount() != 4) continue;
122
123                        // pick up the groups
124                        int taxonID = Integer.parseInt(m.group(1));
125                        String homologeneID = m.group(2);
126                        String title = m.group(4);
127
128                        builder.addTitle(taxonID, homologeneID.trim(), title.trim());
129                    }
130                }
131                catch (NumberFormatException nfe) {
132                    continue;
133                }
134            }
135            else {
136                // this is a orthology line
137                // but we can't be certain if it's trash so we defer
138                // doing startDB and startGroup
139                try {
140                    // parse the line
141                    Matcher m = orthoPattern.matcher(currLine);
142
143                    if (m.matches()) {
144
145                        // this is a orthology line
146                        if (!inDB) {
147                            builder.startDB();
148                            inDB = true;
149                        }
150                        if (!inGroup) {
151                            builder.startGroup();
152                            inGroup = true;
153                        }
154
155                        if (m.groupCount() != 10) continue;
156                        // pick up the groups
157                        String taxonID0 = m.group(1).trim();//System.out.println(taxonID0);
158                        String taxonID1 = m.group(2).trim();//System.out.println(taxonID1);
159                        String type = m.group(3).trim();//System.out.println(type);
160                        String locus0 = m.group(4).trim();
161                        String homoID0 = m.group(5).trim();
162                        String access0 = m.group(6).trim();
163                        String locus1 = m.group(7).trim(); 
164                        String homoID1 = m.group(8).trim();
165                        String access1 = m.group(9).trim();
166                        String finale = m.group(10).trim();//System.out.println(finale);
167
168                        // validate numeric formats
169                        Integer.parseInt(taxonID0);
170                        Integer.parseInt(taxonID1);
171
172                        // validate the similarity type before proceeding
173                        if (   (type.equals("B")) 
174                            || (type.equals("b"))
175                            || (type.equals("c")) ) {
176
177                            if (type.equals("B")) {
178
179                                // validate numeric format
180                                Double.parseDouble(finale);
181
182                                builder.startOrthoPair();
183                                builder.addOrthoPairProperty(HomologeneBuilder.PERCENTIDENTITY, finale);
184                                builder.addOrthoPairProperty(HomologeneBuilder.SIMILARITYTYPE, HomologeneBuilder.MULTIPLE);
185                            }
186                            else if (type.equals("b")) {
187
188                                // validate numeric format
189                                Integer.parseInt(finale);
190
191                                builder.startOrthoPair();
192                                builder.addOrthoPairProperty(HomologeneBuilder.PERCENTIDENTITY, finale);
193                                builder.addOrthoPairProperty(HomologeneBuilder.SIMILARITYTYPE, HomologeneBuilder.TWIN);
194                            }
195                            else if (type.equals("c")) {
196
197                                builder.startOrthoPair();
198                                builder.addOrthoPairProperty(HomologeneBuilder.SIMILARITYTYPE, HomologeneBuilder.CURATED);
199                                builder.addOrthoPairProperty(HomologeneBuilder.PERCENTIDENTITY, finale);
200                            }
201
202                            // add the orthologues
203                            builder.startOrthologue();
204                            builder.addOrthologueProperty(HomologeneBuilder.TAXONID, taxonID0);
205                            builder.addOrthologueProperty(HomologeneBuilder.LOCUSID, locus0);
206                            builder.addOrthologueProperty(HomologeneBuilder.HOMOID, homoID0);
207                            builder.addOrthologueProperty(HomologeneBuilder.ACCESSION, access0);
208                            builder.endOrthologue();
209
210                            builder.startOrthologue();
211                            builder.addOrthologueProperty(HomologeneBuilder.TAXONID, taxonID1);
212                            builder.addOrthologueProperty(HomologeneBuilder.LOCUSID, locus1);
213                            builder.addOrthologueProperty(HomologeneBuilder.HOMOID, homoID1);
214                            builder.addOrthologueProperty(HomologeneBuilder.ACCESSION, access1);
215                            builder.endOrthologue();
216
217                            builder.endOrthoPair();
218                        }
219                    }
220                }
221                catch (NumberFormatException nfe) {
222                    nfe.printStackTrace();
223                    builder.endOrthoPair();
224                    continue;
225                }
226            }
227        }
228
229        // EOF
230        if (inGroup) builder.endGroup();
231        if (inDB) builder.endDB();
232    }
233}
234