001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.homologene; 023 024import java.io.BufferedReader; 025import java.io.FileNotFoundException; 026import java.io.FileReader; 027import java.io.IOException; 028import java.net.URL; 029import java.util.Iterator; 030import java.util.regex.Matcher; 031import java.util.regex.Pattern; 032 033import javax.naming.OperationNotSupportedException; 034 035/** 036 * Homologene is a NCBI dataset that curates sets 037 * of orthologues from the reference model ogranisms. 038 * <p> 039 * This class is a Collection of methods for handling 040 * data from the Homologene dataset. 041 * 042 * @author David Huen 043 */ 044public class HomologeneTools 045{ 046 /** 047 * get the Taxon corresponding to this Taxon ID 048 */ 049 public static Taxon getTaxon(int taxonID) 050 { 051 // currently just does a linear search 052 for (Iterator taxaI = Taxon.taxa.iterator(); taxaI.hasNext(); ) { 053 Taxon curr = (Taxon) taxaI.next(); 054 055 if (curr.getTaxonID() == taxonID) return curr; 056 } 057 058 return null; 059 } 060 061 /** 062 * add a Taxon 063 */ 064 public static Taxon createTaxon(int taxonID, String description) 065 throws DuplicateTaxonException 066 { 067 // first check that the taxon des not exist 068 if (getTaxon(taxonID) != null) throw new DuplicateTaxonException(); 069 070 Taxon newTaxon = new Taxon.TaxonStub(taxonID, description); 071 072 return newTaxon; 073 } 074 075 /** 076 * instantiate a HomologeneDB. 077 * <p> 078 * Currently, only file protocol support is available. 079 */ 080 public static void instantiateDB(URL url, HomologeneBuilder builder) 081 throws OperationNotSupportedException, FileNotFoundException, IOException 082 { 083 boolean inDB = false; 084 boolean inGroup = false; 085 086 087 if (!url.getProtocol().equals("file")) 088 throw new OperationNotSupportedException(); 089 090 // open the file 091 BufferedReader rdr = new BufferedReader( 092 new FileReader(url.getPath()) 093 ); 094 095 // the file may or may not have a ">" at the start 096 097 // read loop 098 Pattern titlePattern = Pattern.compile("TITLE\\s(\\d+)_(\\d+)=(\\S+)\\s(.*)"); 099 Pattern orthoPattern = Pattern.compile("^(\\d+)\\s*\\|\\s*(\\d+)\\s*\\|([Bbc]{1})\\|(.*)\\|\\s*(\\d+)\\s*\\|(.*)\\|(.*)\\|\\s*(\\d+)\\s*\\|(.*)\\|(.*)"); 100 String currLine; 101 while ((currLine = rdr.readLine()) != null) { 102 103 // parse current line 104 if (currLine.startsWith(">")) { 105 // start new group 106 if (!inDB) { 107 builder.startDB(); 108 inDB = true; 109 } 110 if (inGroup) { 111 builder.endGroup(); 112 } 113 builder.startGroup(); inGroup = true; 114 } 115 else if (currLine.startsWith("TITLE")) { 116 try { 117 // parse the line 118 Matcher m = titlePattern.matcher(currLine); 119 120 if (m.matches()) { 121 if (m.groupCount() != 4) continue; 122 123 // pick up the groups 124 int taxonID = Integer.parseInt(m.group(1)); 125 String homologeneID = m.group(2); 126 String title = m.group(4); 127 128 builder.addTitle(taxonID, homologeneID.trim(), title.trim()); 129 } 130 } 131 catch (NumberFormatException nfe) { 132 continue; 133 } 134 } 135 else { 136 // this is a orthology line 137 // but we can't be certain if it's trash so we defer 138 // doing startDB and startGroup 139 try { 140 // parse the line 141 Matcher m = orthoPattern.matcher(currLine); 142 143 if (m.matches()) { 144 145 // this is a orthology line 146 if (!inDB) { 147 builder.startDB(); 148 inDB = true; 149 } 150 if (!inGroup) { 151 builder.startGroup(); 152 inGroup = true; 153 } 154 155 if (m.groupCount() != 10) continue; 156 // pick up the groups 157 String taxonID0 = m.group(1).trim();//System.out.println(taxonID0); 158 String taxonID1 = m.group(2).trim();//System.out.println(taxonID1); 159 String type = m.group(3).trim();//System.out.println(type); 160 String locus0 = m.group(4).trim(); 161 String homoID0 = m.group(5).trim(); 162 String access0 = m.group(6).trim(); 163 String locus1 = m.group(7).trim(); 164 String homoID1 = m.group(8).trim(); 165 String access1 = m.group(9).trim(); 166 String finale = m.group(10).trim();//System.out.println(finale); 167 168 // validate numeric formats 169 Integer.parseInt(taxonID0); 170 Integer.parseInt(taxonID1); 171 172 // validate the similarity type before proceeding 173 if ( (type.equals("B")) 174 || (type.equals("b")) 175 || (type.equals("c")) ) { 176 177 if (type.equals("B")) { 178 179 // validate numeric format 180 Double.parseDouble(finale); 181 182 builder.startOrthoPair(); 183 builder.addOrthoPairProperty(HomologeneBuilder.PERCENTIDENTITY, finale); 184 builder.addOrthoPairProperty(HomologeneBuilder.SIMILARITYTYPE, HomologeneBuilder.MULTIPLE); 185 } 186 else if (type.equals("b")) { 187 188 // validate numeric format 189 Integer.parseInt(finale); 190 191 builder.startOrthoPair(); 192 builder.addOrthoPairProperty(HomologeneBuilder.PERCENTIDENTITY, finale); 193 builder.addOrthoPairProperty(HomologeneBuilder.SIMILARITYTYPE, HomologeneBuilder.TWIN); 194 } 195 else if (type.equals("c")) { 196 197 builder.startOrthoPair(); 198 builder.addOrthoPairProperty(HomologeneBuilder.SIMILARITYTYPE, HomologeneBuilder.CURATED); 199 builder.addOrthoPairProperty(HomologeneBuilder.PERCENTIDENTITY, finale); 200 } 201 202 // add the orthologues 203 builder.startOrthologue(); 204 builder.addOrthologueProperty(HomologeneBuilder.TAXONID, taxonID0); 205 builder.addOrthologueProperty(HomologeneBuilder.LOCUSID, locus0); 206 builder.addOrthologueProperty(HomologeneBuilder.HOMOID, homoID0); 207 builder.addOrthologueProperty(HomologeneBuilder.ACCESSION, access0); 208 builder.endOrthologue(); 209 210 builder.startOrthologue(); 211 builder.addOrthologueProperty(HomologeneBuilder.TAXONID, taxonID1); 212 builder.addOrthologueProperty(HomologeneBuilder.LOCUSID, locus1); 213 builder.addOrthologueProperty(HomologeneBuilder.HOMOID, homoID1); 214 builder.addOrthologueProperty(HomologeneBuilder.ACCESSION, access1); 215 builder.endOrthologue(); 216 217 builder.endOrthoPair(); 218 } 219 } 220 } 221 catch (NumberFormatException nfe) { 222 nfe.printStackTrace(); 223 builder.endOrthoPair(); 224 continue; 225 } 226 } 227 } 228 229 // EOF 230 if (inGroup) builder.endGroup(); 231 if (inDB) builder.endDB(); 232 } 233} 234