001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.indexdb; 023 024import java.io.File; 025import java.io.FileNotFoundException; 026import java.io.FileReader; 027import java.io.IOException; 028import java.util.HashMap; 029import java.util.Map; 030 031import org.biojava.bio.BioException; 032import org.biojava.bio.program.tagvalue.ChangeTable; 033import org.biojava.bio.program.tagvalue.Indexer; 034import org.biojava.bio.program.tagvalue.LineSplitParser; 035import org.biojava.bio.program.tagvalue.Parser; 036import org.biojava.bio.program.tagvalue.ValueChanger; 037import org.biojava.bio.seq.io.SeqIOConstants; 038import org.biojava.utils.CommitFailure; 039import org.biojava.utils.ParserException; 040import org.biojava.utils.io.CountedBufferedReader; 041import org.biojava.utils.io.RAF; 042import org.biojava.utils.lsid.LifeScienceIdentifier; 043 044/** 045 * <code>IndexTools</code> contains static utility methods for 046 * creating flatfile indices according to the OBDA standard. 047 * 048 * @author Keith James 049 * @author Matthew Pocock 050 */ 051public class IndexTools 052{ 053 // Cannot be instantiated 054 private IndexTools() { } 055 056 /** 057 * <code>indexFasta</code> indexes DNA, RNA or protein Fasta 058 * format sequence files on primary identifier. 059 * 060 * @param location a <code>File</code> directory which will 061 * contain the indices. 062 * @param seqFiles a <code>File []</code> array of files to index. 063 * @param alphabetIdentifier an <code>int</code> indicating the 064 * type of sequence to be indexed. May be one of 065 * <code>SeqIOConstants.DNA SeqIOConstants.RNA 066 * SeqIOConstants.AA</code>. 067 * @param name a <code>String</code> arbitrary database name. 068 * 069 * @exception FileNotFoundException if an error occurs. 070 * @exception IOException if an error occurs. 071 * @exception ParserException if an error occurs. 072 * @exception BioException if an error occurs. 073 */ 074 public static void indexFasta(String name, File location, File [] seqFiles, 075 int alphabetIdentifier) 076 throws FileNotFoundException, IOException, ParserException, 077 BioException 078 { 079 BioStoreFactory bsf = new BioStoreFactory(); 080 bsf.setStoreName(name); 081 082 switch (alphabetIdentifier) 083 { 084 case (SeqIOConstants.DNA): 085 bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_DNA); 086 break; 087 case (SeqIOConstants.RNA): 088 bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_RNA); 089 break; 090 case (SeqIOConstants.AA): 091 bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_AA); 092 break; 093 094 default: 095 throw new IllegalArgumentException("Unknown alphabet identifier '" 096 + alphabetIdentifier 097 + "'"); 098 } 099 100 _indexFasta(bsf, location, seqFiles); 101 } 102 103 /** 104 * <code>indexEmbl</code> indexes DNA, RNA or protein EMBL format 105 * sequence files on ID as primary identifier and AC as secondary. 106 * 107 * @param location a <code>File</code> directory which will 108 * contain the indices. 109 * @param seqFiles a <code>File []</code> array of files to index. 110 * @param alphabetIdentifier an <code>int</code> indicating the 111 * type of sequence to be indexed. May be one of 112 * <code>SeqIOConstants.DNA SeqIOConstants.RNA 113 * SeqIOConstants.AA</code>. 114 * @param name a <code>String</code> arbitrary database name. 115 * 116 * @exception FileNotFoundException if an error occurs. 117 * @exception IOException if an error occurs. 118 * @exception ParserException if an error occurs. 119 * @exception BioException if an error occurs. 120 */ 121 public static void indexEmbl(String name, File location, File [] seqFiles, 122 int alphabetIdentifier) 123 throws FileNotFoundException, IOException, ParserException, 124 BioException 125 { 126 BioStoreFactory bsf = new BioStoreFactory(); 127 bsf.setStoreName(name); 128 129 switch (alphabetIdentifier) 130 { 131 case (SeqIOConstants.DNA): 132 bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_DNA); 133 break; 134 case (SeqIOConstants.RNA): 135 bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_RNA); 136 break; 137 case (SeqIOConstants.AA): 138 bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_AA); 139 break; 140 141 default: 142 throw new IllegalArgumentException("Unknown alphabet identifier '" 143 + alphabetIdentifier 144 + "'"); 145 } 146 147 _indexEmblLike(bsf, location, seqFiles); 148 } 149 150 /** 151 * <code>indexGenbank</code> indexes DNA, RNA or protein Genbank 152 * format sequence files on LOCUS as primary identifier and 153 * ACCESSION as secondary. 154 * 155 * @param location a <code>File</code> directory which will 156 * contain the indices. 157 * @param seqFiles a <code>File []</code> array of files to index. 158 * @param alphabetIdentifier an <code>int</code> indicating the 159 * type of sequence to be indexed. May be one of 160 * <code>SeqIOConstants.DNA SeqIOConstants.RNA 161 * SeqIOConstants.AA</code>. 162 * @param name a <code>String</code> arbitrary database name. 163 * 164 * @exception FileNotFoundException if an error occurs. 165 * @exception IOException if an error occurs. 166 * @exception ParserException if an error occurs. 167 * @exception BioException if an error occurs. 168 */ 169 public static void indexGenbank(String name, File location, File [] seqFiles, 170 int alphabetIdentifier) 171 throws FileNotFoundException, IOException, ParserException, 172 BioException 173 { 174 BioStoreFactory bsf = new BioStoreFactory(); 175 bsf.setStoreName(name); 176 177 switch (alphabetIdentifier) 178 { 179 case (SeqIOConstants.DNA): 180 bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_DNA); 181 break; 182 case (SeqIOConstants.RNA): 183 bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_RNA); 184 break; 185 case (SeqIOConstants.AA): 186 bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_AA); 187 break; 188 189 default: 190 throw new IllegalArgumentException("Unknown alphabet identifier '" 191 + alphabetIdentifier 192 + "'"); 193 } 194 195 _indexGenbank(bsf, location, seqFiles); 196 } 197 198 199 /** 200 * <code>indexSwissprot</code> indexes Swissprot format protein 201 * sequence files on ID as primary identifier. 202 * 203 * @param location a <code>File</code> directory which will 204 * contain the indices. 205 * @param seqFiles a <code>File []</code> array of files to index. 206 * @exception FileNotFoundException if an error occurs. 207 * @exception IOException if an error occurs. 208 * @exception ParserException if an error occurs. 209 * @exception BioException if an error occurs. 210 */ 211 public static void indexSwissprot(String name, File location, File [] seqFiles) 212 throws FileNotFoundException, IOException, ParserException, 213 BioException 214 { 215 BioStoreFactory bsf = new BioStoreFactory(); 216 bsf.setStoreName(name); 217 bsf.setSequenceFormat(LifeScienceIdentifier.valueOf("open-bio.org", 218 "swiss", 219 "protein" )); 220 _indexEmblLike(bsf, location, seqFiles); 221 } 222 223 private static void _indexFasta(BioStoreFactory bsf, 224 File location, File [] seqFiles) 225 throws FileNotFoundException, IOException, BioException 226 { 227 bsf.setPrimaryKey("ID"); 228 bsf.setStoreLocation(location); 229 bsf.addKey("ID", 10); 230 231 BioStore store = bsf.createBioStore(); 232 233 for (int i = 0; i < seqFiles.length; i++) 234 { 235 // File data 236 long newOffset = 0L; 237 long oldOffset = 0L; 238 RAF raf = new RAF(seqFiles[i], "r"); 239 Map map = new HashMap(); 240 241 CountedBufferedReader reader = 242 new CountedBufferedReader(new FileReader(raf.getFile())); 243 244 // Record data 245 String id = ""; 246 247 String line = null; 248 while ((line = reader.readLine()) != null) 249 { 250 if (line.startsWith(">")) 251 { 252 // Write at end of record 253 if (newOffset > 0) 254 { 255 store.writeRecord(raf, oldOffset, 256 (int) (newOffset - oldOffset), 257 id, map); 258 oldOffset = newOffset; 259 } 260 newOffset = reader.getFilePointer(); 261 262 int delimeter = line.indexOf(" "); 263 if (delimeter < 0) 264 id = line.substring(1); 265 else 266 id = line.substring(1, delimeter); 267 } 268 else 269 { 270 newOffset = reader.getFilePointer(); 271 } 272 } 273 274 // Write final record 275 store.writeRecord(raf, oldOffset, 276 (int) (newOffset - oldOffset), 277 id, map); 278 } 279 280 try 281 { 282 store.commit(); 283 } 284 catch (CommitFailure ne) 285 { 286 throw new BioException("Failed to commit new index to file",ne); 287 } 288 } 289 290 private static void _indexEmblLike(BioStoreFactory bsf, 291 File location, File [] seqFiles) 292 throws FileNotFoundException, IOException, ParserException, 293 BioException 294 { 295 bsf.setPrimaryKey("ID"); 296 bsf.setStoreLocation(location); 297 bsf.addKey("AC", 10); 298 bsf.addKey("ID", 10); 299 300 BioStore store = bsf.createBioStore(); 301 302 for (int i = 0; i < seqFiles.length; i++) 303 { 304 Indexer indexer = new Indexer(seqFiles[i], store); 305 indexer.setPrimaryKeyName("ID"); 306 indexer.addSecondaryKey("AC"); 307 308 ChangeTable changeTable = new ChangeTable(); 309 310 changeTable.setChanger("ID", new ChangeTable.Changer() 311 { 312 public Object change(Object value) 313 { 314 String s = (String) value; 315 int i = s.indexOf(" "); 316 317 if (i < 0) 318 return s; 319 else 320 return s.substring(0, i); 321 } 322 }); 323 324 changeTable.setChanger("AC", new ChangeTable.Changer() 325 { 326 public Object change(Object value) 327 { 328 String s = (String) value; 329 int i = s.indexOf(";"); 330 return s.substring(0, i); 331 } 332 }); 333 334 ValueChanger changer = new ValueChanger(indexer, changeTable); 335 Parser parser = new Parser(); 336 337 while(parser.read(indexer.getReader(), 338 LineSplitParser.EMBL, changer)); 339 } 340 341 try 342 { 343 store.commit(); 344 } 345 catch (CommitFailure ne) 346 { 347 throw new BioException("Failed to commit new index to file",ne); 348 } 349 } 350 351 private static void _indexGenbank(BioStoreFactory bsf, 352 File location, File [] seqFiles) 353 throws FileNotFoundException, IOException, ParserException, 354 BioException 355 { 356 bsf.setPrimaryKey("LOCUS"); 357 bsf.setStoreLocation(location); 358 bsf.addKey("LOCUS", 10); 359 bsf.addKey("ACCESSION", 10); 360 361 BioStore store = bsf.createBioStore(); 362 363 for (int i = 0; i < seqFiles.length; i++) 364 { 365 Indexer indexer = new Indexer(seqFiles[i], store); 366 indexer.setPrimaryKeyName("LOCUS"); 367 indexer.addSecondaryKey("ACCESSION"); 368 369 ChangeTable changeTable = new ChangeTable(); 370 371 changeTable.setChanger("LOCUS", new ChangeTable.Changer() 372 { 373 public Object change(Object value) 374 { 375 String s = (String) value; 376 int i = s.indexOf(" "); 377 378 if (i < 0) 379 return s; 380 else 381 return s.substring(0, i); 382 } 383 }); 384 385 ValueChanger changer = new ValueChanger(indexer, changeTable); 386 Parser parser = new Parser(); 387 388 while(parser.read(indexer.getReader(), 389 LineSplitParser.GENBANK, changer)); 390 } 391 392 try 393 { 394 store.commit(); 395 } 396 catch (CommitFailure ne) 397 { 398 throw new BioException("Failed to commit new index to file",ne); 399 } 400 } 401}