001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.io.embl;
022
023
024import java.io.*;
025import java.util.Arrays;
026import java.util.LinkedList;
027
028
029/**
030 * This class should process the data of embl file
031 *
032 * @author Noor Aldeen Al Mbaidin
033 * @since 5.0.0
034 */
035public class EmblReader {
036
037    /**
038     * The parsing is done in this method.<br>
039     * This method tries to process all the Embl records
040     * in the File , closes the underlying resource,
041     * and return the results in object of EmblRecord.<br>
042     *
043     * @return EmblRecord containing all the parsed Embl records
044     * @throws IOException
045     */
046    public static EmblRecord process(File file) throws IOException {
047
048        EmblRecord emblRecord = new EmblRecord();
049        StringBuilder sequence = new StringBuilder("");
050        LinkedList<EmblReference> emblReferences = new LinkedList<>();
051        EmblReference emblReference = new EmblReference();
052        LinkedList<String> accessionNumber = new LinkedList<>();
053        LinkedList<String> keyword = new LinkedList<>();
054
055        if (file == null)
056            throw new NullPointerException("file can't be null");
057
058        if (file.isDirectory())
059            throw new IllegalArgumentException("the file can't be a directory");
060
061        try (FileReader fileReader = new FileReader(file)) {
062            String line = "";
063            String lineIdentifier;
064            String lineInfo;
065            try (BufferedReader bufferedReader = new BufferedReader(fileReader)) {
066                while ((line = bufferedReader.readLine()) != null) {
067                    if (line.length() > 1) {
068                        lineInfo = line.substring(2, line.length()).trim();
069                        lineIdentifier = line.substring(0, 2);
070                        if (lineIdentifier.equals("ID"))
071                            emblRecord.setEmblId(populateID(lineInfo));
072                        else if (lineIdentifier.equals("AC"))
073                            populateAccessionNumber(line, accessionNumber);
074                        else if (lineIdentifier.equals("DT") && line.contains("Created"))
075                            emblRecord.setCreatedDate(lineInfo);
076                        else if (lineIdentifier.equals("DT") && line.contains("updated"))
077                            emblRecord.setLastUpdatedDate(lineInfo);
078                        else if (lineIdentifier.equals("DE"))
079                            emblRecord.setSequenceDescription(lineInfo);
080                        else if (lineIdentifier.equals("KW"))
081                            keyword.add(lineInfo);
082                        else if (lineIdentifier.equals("OS"))
083                            emblRecord.setOrganismSpecies(lineInfo);
084                        else if (lineIdentifier.equals("OC"))
085                            emblRecord.setOrganismClassification(lineInfo);
086                        else if (lineIdentifier.equals("OG"))
087                            emblRecord.setOrGanelle(lineInfo);
088                        else if (lineIdentifier.equals("RN") || lineIdentifier.equals("RP")
089                                || lineIdentifier.equals("RX") || lineIdentifier.equals("RG")
090                                || lineIdentifier.equals("RA") || lineIdentifier.equals("RT")
091                                || lineIdentifier.equals("RL"))
092                            populateEmblReferences(lineIdentifier, lineInfo, emblReference, emblReferences);
093                        else if (lineIdentifier.equals("DR"))
094                            emblRecord.setDatabaseCrossReference(lineInfo);
095                        else if (lineIdentifier.equals("AH"))
096                            emblRecord.setAssemblyHeader(lineInfo);
097                        else if (lineIdentifier.equals("AS"))
098                            emblRecord.setAssemblyInformation(lineInfo);
099                        else if (lineIdentifier.equals("CO"))
100                            emblRecord.setConstructedSequence(lineInfo);
101                        else if (lineIdentifier.equals("FH"))
102                            emblRecord.setFeatureHeader(lineInfo);
103                        else if (lineIdentifier.equals("FT"))
104                            emblRecord.setFeatureTable(lineInfo);
105                        else if (lineIdentifier.equals("SQ"))
106                            emblRecord.setSequenceHeader(lineInfo);
107                        else if (lineIdentifier.equals("  ") && !lineIdentifier.equals("//"))
108                            populateSequence(line, sequence);
109                        else if (lineIdentifier.equals("//")) {
110                            emblRecord.setKeyword(keyword);
111                            emblRecord.setEmblReference(emblReferences);
112                            emblRecord.setAccessionNumber(accessionNumber);
113                            emblRecord.setSequence(sequence.toString());
114                        }
115
116                    }
117                }
118            }
119        }
120
121        return emblRecord;
122    }
123
124    private static void populateSequence(String line, StringBuilder sequence) {
125        String sequenceLine = line.replace(" ", "").
126                replaceAll("[0-9]", "");
127        sequence.append(sequenceLine);
128    }
129
130    private static void populateEmblReferences(String lineIdentifier, String lineInfo, EmblReference emblReference
131            , LinkedList<EmblReference> emblReferences) {
132        if (lineIdentifier.equals("RN"))
133            emblReference.setReferenceNumber(lineInfo);
134        else if (lineIdentifier.equals("RP"))
135            emblReference.setReferencePosition(lineInfo);
136        else if (lineIdentifier.equals("RX"))
137            emblReference.setReferenceCrossReference(lineInfo);
138        else if (lineIdentifier.equals("RG"))
139            emblReference.setReferenceGroup(lineInfo);
140        else if (lineIdentifier.equals("RA"))
141            emblReference.setReferenceAuthor(lineInfo);
142        else if (lineIdentifier.equals("RT"))
143            emblReference.setReferenceTitle(lineInfo);
144        else if (lineIdentifier.equals("RL")) {
145            emblReference.setReferenceLocation(lineInfo);
146            emblReferences.add(emblReference.copyEmblReference(emblReference));
147        }
148    }
149
150    private static void populateAccessionNumber(String line, LinkedList<String> accessionNumber) {
151        accessionNumber.add(line);
152    }
153
154    private static EmblId populateID(String line) {
155        String[] strings = line.split(";");
156        Arrays.stream(strings).map(String::trim).toArray(unused -> strings);
157        EmblId emblId = new EmblId(strings[0], strings[1], strings[2]
158                , strings[3], strings[4], strings[5], strings[6]);
159        return emblId;
160    }
161
162
163}