BioJava:Cookbook:Sequence:Regex

From BioJava

Jump to: navigation, search

How can I make a motif into a regular expression?

One of the interesting things you can do with BioJava's MotifTools is to make a String into a regular expression Pattern. You can then use this Pattern to search a SymbolList object for occurrences of that Pattern. The Pattern can even be generated from an ambiguous sequence such as "acgytnwacrs". To accomplish this task, BioJava contains a wrapper around Java's built-in regular expression functionality.

A simple program is implemented below:

// Biojava imports
import org.biojava.bio.*;
 
public class AmbiguitySearch {
	public static void main(String[] args) {
	   try {
		// Variables needed...
		Matcher occurences;
		FiniteAlphabet IUPAC = DNATools.getDNA();
		SymbolList WorkingSequence = DNATools.createDNA("tagagatagacgatagc");
 
		// Create pattern using pattern factory.
		Pattern pattern;
		PatternFactory FACTORY = PatternFactory.makeFactory(IUPAC);
		try{
			pattern = FACTORY.compile("wtagn");
		} catch(Exception e) {e.printStackTrace(); return;}
			System.out.println("Searching for: "+pattern.patternAsString());
 
		// Obtain iterator of matches.
		try {
			occurences = pattern.matcher( WorkingSequence );
		} catch(Exception e) {e.printStackTrace(); return;}
 
		// Foreach match
		while( occurences.find() ) {
			System.out.println("Match: " +"\t"+ WorkingSequence.seqString() 
                        +"\n"+ occurences.start() +"\t"+ occurences.group().seqString());
		}
		}
 
		catch (Exception ex) {
			ex.printStackTrace();
			System.exit(1);
		}
	}
}

Alternatively, the following example from Andy Hammer demonstrates how the Java regular expression functionality can be used directly to search a String object for pattern matches.

/**
 * MotifLister.java
 * Modified slightly from the original by Andy Hammer
 *
 * Lists all instances of a motif in specified (dna/rna/protein) fasta file.
 * The motif can contain Ambiguity symbols
 * Lists the ORF title and position of motif
 * Outputs a list of counts to stdout.
 */
import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.biojava.*;
import org.biojava.bio.*;
import org.biojava.bio.seq.*;
import org.biojava.bio.seq.io.*;
import org.biojava.bio.symbol.*;
public class MotifLister{
 
  private SymbolList motif;
  private int frame;
  private int count;
  private SequenceIterator si;
 
 
  public MotifLister(String type, String inputFile,
                     String target, String placement)throws Exception{
 
    System.out.println("MotifLister is searching file " + inputFile +
                       " for the motif '" + target +
                        "' in frame " + placement + ".");
 
    try{
      if(type.equalsIgnoreCase("dna")){
        motif = DNATools.createDNA(target);
      }else if(type.equalsIgnoreCase("rna")){
        motif = RNATools.createRNA(target);
      }else{
        motif = ProteinTools.createProtein(target);
      }
    }
    catch(BioError e){
      System.out.println("Error!!  Data type must match type of motif.");
      System.out.println("Specifically, " + target + " is not " + type);
      System.exit(0);
    }
 
    frame = Integer.parseInt(placement);
 
    if (frame < 0 || frame > 3) {
      System.out.println("Only frames 0 through 3 are allowed");
      System.out.println("frame zero searches all frames.");
      System.exit(0);
    }
 
    //make a regex expression for the SymbolList using MotifTools
    Pattern p = Pattern.compile( MotifTools.createRegex(motif) );
 
    count = 0;
 
    //read the input
    FileInputStream fis = new FileInputStream(inputFile);
    InputStreamReader isr = new InputStreamReader(fis);
    BufferedReader input = new BufferedReader(isr);
 
    try{
      si = (SequenceIterator)SeqIOTools.fileToBiojava("fasta", type, input);
 
      //for each sequence
      while (si.hasNext()){
        Sequence seq = si.nextSequence();
 
        //get the regex matcher for the pattern
        Matcher matcher = p.matcher(seq.seqString());
 
        int start = 0;
 
        //find the next match from start
        while(matcher.find(start)) {
          start = matcher.start();
          int end = matcher.end();
          int result = (start % 3) + 1;
          if(result == frame || frame == 0){
 
            //print the match location
            System.out.println(seq.getName() + " : " +
                               "[" + (start + 1) + "," + (end) + "]");
            count++;
          }
          start++;
        }
      }
 
      input.close(); //close the file
      System.out.println("Total Hits = " + count);
    }
    catch(BioException e){
      System.out.println(inputFile + " is not a " + type + " file.");
      System.out.println(e);
    }
  }
 
  public static void main(String[] args)throws Exception{
    if (args.length < 4) {
      System.err.println(" Usage: >java -jar MotifLister.jar type fastaFile motif frame" +
                         "\n Ex: >java -jar MotifLister.jar dna eColi.fasta AAAAAAG 3 > output.txt" +
                         "\n would search for A AAA AAG in the third frame in dna file eColi.fasta" +
                         "\n and print the results to file output.txt." +
                         "\n 'type' can be dna, rna, or protein." +
                         "\n 'frame' can be integers 0 through 3." +
                         "\n 0 counts any instance of the motif." +
                         "\n 1, 2, 3 counts only instances of the motif in the specified frame." +
                         "\n Capture output with redirection operator '>'.");
    }else{
      MotifLister ML = new MotifLister(args[0], args[1], args[2], args[3]);
    }
  }
}
Personal tools