001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.io.embl;
022
023
024import java.io.*;
025import java.util.Arrays;
026import java.util.LinkedList;
027
028
029/**
030 * This class should process the data of embl file
031 *
032 * @author Noor Aldeen Al Mbaidin
033 * @since 5.0.0
034 */
035public class EmblReader {
036
037        /**
038         * The parsing is done in this method.<br>
039         * This method tries to process all the Embl records
040         * in the File , closes the underlying resource,
041         * and return the results in object of EmblRecord.<br>
042         *
043         * @return EmblRecord containing all the parsed Embl records
044         * @throws IOException
045         */
046        public static EmblRecord process(File file) throws IOException {
047
048                EmblRecord emblRecord = new EmblRecord();
049                StringBuilder sequence = new StringBuilder("");
050                LinkedList<EmblReference> emblReferences = new LinkedList<>();
051                EmblReference emblReference = new EmblReference();
052                LinkedList<String> accessionNumber = new LinkedList<>();
053                LinkedList<String> keyword = new LinkedList<>();
054
055                if (file == null)
056                        throw new NullPointerException("file can't be null");
057
058                if (file.isDirectory())
059                        throw new IllegalArgumentException("the file can't be a directory");
060
061                try (FileReader fileReader = new FileReader(file)) {
062                        String line = "";
063                        String lineIdentifier;
064                        String lineInfo;
065                        try (BufferedReader bufferedReader = new BufferedReader(fileReader)) {
066                                while ((line = bufferedReader.readLine()) != null) {
067                                        if (line.length() > 1) {
068                                                lineInfo = line.substring(2, line.length()).trim();
069                                                lineIdentifier = line.substring(0, 2);
070                                                if ("ID".equals(lineIdentifier))
071                                                        emblRecord.setEmblId(populateID(lineInfo));
072                                                else if ("AC".equals(lineIdentifier))
073                                                        populateAccessionNumber(line, accessionNumber);
074                                                else if ("DT".equals(lineIdentifier) && line.contains("Created"))
075                                                        emblRecord.setCreatedDate(lineInfo);
076                                                else if ("DT".equals(lineIdentifier) && line.contains("updated"))
077                                                        emblRecord.setLastUpdatedDate(lineInfo);
078                                                else if ("DE".equals(lineIdentifier))
079                                                        emblRecord.setSequenceDescription(lineInfo);
080                                                else if ("KW".equals(lineIdentifier))
081                                                        keyword.add(lineInfo);
082                                                else if ("OS".equals(lineIdentifier))
083                                                        emblRecord.setOrganismSpecies(lineInfo);
084                                                else if ("OC".equals(lineIdentifier))
085                                                        emblRecord.setOrganismClassification(lineInfo);
086                                                else if ("OG".equals(lineIdentifier))
087                                                        emblRecord.setOrGanelle(lineInfo);
088                                                else if ("RN".equals(lineIdentifier) || "RP".equals(lineIdentifier)
089                                                                || "RX".equals(lineIdentifier) || "RG".equals(lineIdentifier)
090                                                                || "RA".equals(lineIdentifier) || "RT".equals(lineIdentifier)
091                                                                || "RL".equals(lineIdentifier))
092                                                        populateEmblReferences(lineIdentifier, lineInfo, emblReference, emblReferences);
093                                                else if ("DR".equals(lineIdentifier))
094                                                        emblRecord.setDatabaseCrossReference(lineInfo);
095                                                else if ("AH".equals(lineIdentifier))
096                                                        emblRecord.setAssemblyHeader(lineInfo);
097                                                else if ("AS".equals(lineIdentifier))
098                                                        emblRecord.setAssemblyInformation(lineInfo);
099                                                else if ("CO".equals(lineIdentifier))
100                                                        emblRecord.setConstructedSequence(lineInfo);
101                                                else if ("FH".equals(lineIdentifier))
102                                                        emblRecord.setFeatureHeader(lineInfo);
103                                                else if ("FT".equals(lineIdentifier))
104                                                        emblRecord.setFeatureTable(lineInfo);
105                                                else if ("SQ".equals(lineIdentifier))
106                                                        emblRecord.setSequenceHeader(lineInfo);
107                                                else if ("  ".equals(lineIdentifier) && !"//".equals(lineIdentifier))
108                                                        populateSequence(line, sequence);
109                                                else if ("//".equals(lineIdentifier)) {
110                                                        emblRecord.setKeyword(keyword);
111                                                        emblRecord.setEmblReference(emblReferences);
112                                                        emblRecord.setAccessionNumber(accessionNumber);
113                                                        emblRecord.setSequence(sequence.toString());
114                                                }
115
116                                        }
117                                }
118                        }
119                }
120
121                return emblRecord;
122        }
123
124        private static void populateSequence(String line, StringBuilder sequence) {
125                String sequenceLine = line.replace(" ", "").
126                                replaceAll("[0-9]", "");
127                sequence.append(sequenceLine);
128        }
129
130        private static void populateEmblReferences(String lineIdentifier, String lineInfo, EmblReference emblReference
131                        , LinkedList<EmblReference> emblReferences) {
132                if ("RN".equals(lineIdentifier))
133                        emblReference.setReferenceNumber(lineInfo);
134                else if ("RP".equals(lineIdentifier))
135                        emblReference.setReferencePosition(lineInfo);
136                else if ("RX".equals(lineIdentifier))
137                        emblReference.setReferenceCrossReference(lineInfo);
138                else if ("RG".equals(lineIdentifier))
139                        emblReference.setReferenceGroup(lineInfo);
140                else if ("RA".equals(lineIdentifier))
141                        emblReference.setReferenceAuthor(lineInfo);
142                else if ("RT".equals(lineIdentifier))
143                        emblReference.setReferenceTitle(lineInfo);
144                else if ("RL".equals(lineIdentifier)) {
145                        emblReference.setReferenceLocation(lineInfo);
146                        emblReferences.add(emblReference.copyEmblReference(emblReference));
147                }
148        }
149
150        private static void populateAccessionNumber(String line, LinkedList<String> accessionNumber) {
151                accessionNumber.add(line);
152        }
153
154        private static EmblId populateID(String line) {
155                String[] strings = line.split(";");
156                Arrays.stream(strings).map(String::trim).toArray(unused -> strings);
157                EmblId emblId = new EmblId(strings[0], strings[1], strings[2]
158                                , strings[3], strings[4], strings[5], strings[6]);
159                return emblId;
160        }
161
162
163}