/*
 * Decompiled with CFR 0.152.
 */
package chemaxon.naming;

import chemaxon.common.util.IntVector;
import chemaxon.formats.MolExporter;
import chemaxon.license.LicenseHandler;
import chemaxon.marvin.io.MolExportException;
import chemaxon.marvin.io.MonitorableInputStream;
import chemaxon.marvin.io.formats.name.nameexport.Util;
import chemaxon.naming.document.D2S;
import chemaxon.naming.document.Extractor;
import chemaxon.naming.document.HTMLWordReader;
import chemaxon.naming.document.PDFBridge;
import chemaxon.naming.document.WordReader;
import chemaxon.struc.Molecule;
import chemaxon.util.IO;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

public class DocumentExtractor {
    // When true, addHit() folds repeated names into a single Hit that accumulates every position.
    private boolean mergeMultipleHits = false;
    // Monitorable wrapper around the source stream, consulted by process() for progress estimates.
    // Only assigned by setSource() when the total byte size is known (> 0); otherwise stays null.
    private MonitorableInputStream in;
    // Character source handed to WordReader / HTMLWordReader; null until a source is supplied.
    private BufferedReader reader;
    // Hits accumulated so far; getHits() returns this very list (not a copy).
    private List<Hit> hits = new ArrayList<Hit>();
    // Underlying extraction engine.
    // NOTE(review): the meaning of the constructor arguments is defined in Extractor and is not visible here.
    Extractor ex = new Extractor(1000, false, true, new D2S.Options());
    // Static, shared by all instances: when true, process() reports in.getLine() as the hit position
    // instead of in.getWordStart(). Set by main() from -line / -char.
    private static boolean countLines = false;
    // Static: encoding chosen on the command line by main() (null means system default);
    // read by printEncodingError().
    private static String encoding;
    // Page number parsed from the most recent "<<PAGE" marker word; -1 until one is seen.
    // Copied into every Hit at construction time.
    int pageNum = -1;
    // Not referenced inside this class. Presumably property keys attached to result molecules
    // by callers -- TODO confirm against users of these constants.
    public static final String propertySourceDocument = "document";
    public static final String propertyPage = "page";

    /**
     * Creates an extractor with no document attached. A source must be supplied later,
     * e.g. through {@link #processPlainText(Reader)} or {@link #processHTML(Reader)};
     * the no-argument process methods throw while no reader is set.
     */
    public DocumentExtractor() {
    }

    /**
     * Creates an extractor reading the given file with the platform default encoding.
     * Delegates to {@link #DocumentExtractor(File, String)} with a {@code null} encoding.
     *
     * @param document the file to read
     * @throws IOException if the file cannot be opened
     */
    public DocumentExtractor(File document) throws IOException {
        this(document, null);
    }

    public DocumentExtractor(File document, String encoding) throws IOException {
        InputStream in = new FileInputStream(document);
        if (document.getName().endsWith(".gz")) {
            in = new GZIPInputStream(in);
        }
        this.setSource(document.length(), in, encoding);
    }

    /**
     * Creates an extractor reading from the given URL. Opens a connection and delegates
     * to {@link #DocumentExtractor(URLConnection)}.
     *
     * @param document the URL to read
     * @throws IOException if the connection cannot be opened or read
     */
    public DocumentExtractor(URL document) throws IOException {
        this(document.openConnection());
    }

    /**
     * Creates an extractor reading from an open URL connection.
     *
     * <p>The character encoding is taken from the {@code charset=} parameter of the
     * connection's content type when present; otherwise ISO-8859-1 is used. The
     * parameter name is matched case-insensitively, anything after a following
     * {@code ;} is ignored, and surrounding double quotes are removed. A missing
     * content type (which {@link URLConnection#getContentType()} reports as
     * {@code null}) falls back to the default instead of failing.
     *
     * @param document the connection to read from
     * @throws IOException if the stream cannot be opened or the declared encoding is
     *         not supported
     */
    public DocumentExtractor(URLConnection document) throws IOException {
        String encoding = "ISO-8859-1";
        String contentType = document.getContentType();
        if (contentType != null) {
            final String marker = "charset=";
            int encodingStart = -1;
            for (int i = 0; i + marker.length() <= contentType.length(); ++i) {
                if (contentType.regionMatches(true, i, marker, 0, marker.length())) {
                    encodingStart = i;
                    break;
                }
            }
            if (encodingStart != -1) {
                String declared = contentType.substring(encodingStart + marker.length());
                // Drop any further content-type parameters, e.g. "utf-8; boundary=x".
                int parameterEnd = declared.indexOf(';');
                if (parameterEnd != -1) {
                    declared = declared.substring(0, parameterEnd);
                }
                declared = declared.trim();
                if (declared.length() >= 2 && declared.startsWith("\"") && declared.endsWith("\"")) {
                    declared = declared.substring(1, declared.length() - 1).trim();
                }
                if (declared.length() > 0) {
                    encoding = declared;
                }
            }
        }
        this.setSource(document.getContentLength(), document.getInputStream(), encoding);
    }

    /**
     * Installs {@code in} as the document source.
     *
     * <p>When the total size is known (&gt; 0) the stream is wrapped in a
     * {@link MonitorableInputStream} and remembered in {@code this.in} so that
     * {@code process} can derive progress estimates from it. The monitor takes an
     * {@code int} size, so larger values are clamped to {@link Integer#MAX_VALUE}
     * rather than being truncated to an arbitrary (possibly negative) number.
     *
     * @param totalByteSize total number of bytes in the source, or a value &lt;= 0 if unknown
     * @param in the raw byte stream
     * @param encoding character encoding name, or {@code null} for the platform default
     * @throws UnsupportedEncodingException if {@code encoding} is not supported
     */
    private void setSource(long totalByteSize, InputStream in, String encoding) throws UnsupportedEncodingException {
        if (totalByteSize > 0L) {
            int monitoredSize = (int)Math.min(totalByteSize, (long)Integer.MAX_VALUE);
            this.in = new MonitorableInputStream(in, monitoredSize);
            in = this.in;
        }
        InputStreamReader r = encoding == null ? new InputStreamReader(in) : new InputStreamReader(in, encoding);
        // The small (100 char) buffer is kept from the original code.
        // NOTE(review): presumably chosen so the monitored byte position tracks the words
        // consumed closely -- confirm before enlarging it.
        this.reader = new BufferedReader(r, 100);
    }

    /**
     * Creates an extractor reading characters from the given reader. No monitored
     * stream is installed on this path, so progress callbacks report an estimated
     * total of -1.
     *
     * @param r the character source
     */
    public DocumentExtractor(Reader r) {
        this.reader = new BufferedReader(r);
    }

    /**
     * Creates an extractor over an in-memory text. The text is encoded to UTF-8 bytes
     * and read back through the regular stream path, so the byte length is available
     * as the total size for progress estimation.
     *
     * @param text the document text
     */
    public DocumentExtractor(String text) {
        final byte[] utf8 = text.getBytes(IO.UTF8);
        try {
            this.setSource(utf8.length, new ByteArrayInputStream(utf8), "UTF-8");
        } catch (UnsupportedEncodingException unsupported) {
            // Rethrown unchecked, exactly as before; this constructor declares no checked exceptions.
            throw new RuntimeException(unsupported);
        }
    }

    /**
     * Forwards {@code value} to {@code enableCAS} on the extraction engine's options.
     * NOTE(review): presumably toggles recognition of CAS registry numbers -- the exact
     * effect is defined by the options class, not visible here.
     *
     * @param value the flag passed to {@code enableCAS}
     */
    public void setCasNumberLookup(boolean value) {
        this.ex.options.enableCAS(value);
    }

    /**
     * Forwards the flag to {@code Extractor.acceptElements}.
     * NOTE(review): presumably controls whether element names count as hits -- confirm in Extractor.
     *
     * @param on the flag passed through
     */
    public void acceptElements(boolean on) {
        this.ex.acceptElements(on);
    }

    /**
     * Forwards the flag to {@code Extractor.acceptIons}.
     * NOTE(review): presumably controls whether ion names count as hits -- confirm in Extractor.
     *
     * @param on the flag passed through
     */
    public void acceptIons(boolean on) {
        this.ex.acceptIons(on);
    }

    /**
     * Forwards the flag to {@code Extractor.acceptGenericNames}.
     * NOTE(review): presumably controls whether generic (class-level) names count as hits --
     * confirm in Extractor.
     *
     * @param on the flag passed through
     */
    public void acceptGenericNames(boolean on) {
        this.ex.acceptGenericNames(on);
    }

    /**
     * Prints the command-line synopsis to standard output. The synopsis lists every
     * option that {@code main} recognises, including {@code -count}, {@code -merge},
     * {@code -encoding}, {@code -verbose} and {@code -progress}, and the accepted
     * input forms (file, http URL, or {@code -} for standard input).
     */
    private static void usage() {
        System.out.println("java chemaxon.naming.DocumentExtractor [-xml] [-smiles|-mrv|-text|-count] [-line|-char] [-merge] [-encoding ENCODING_NAME] [-verbose] [-progress] [-o outputFile] inputFile|URL|-");
    }

    /**
     * Command-line entry point: extracts chemical names from one document and prints
     * the hits.
     *
     * <p>Options (all must precede the input argument):
     * <ul>
     *   <li>{@code -o FILE} write to FILE; a {@code .smi}/{@code .cxsmi} or {@code .mrv}
     *       extension also selects the matching output format</li>
     *   <li>{@code -smiles}, {@code -mrv}, {@code -text}, {@code -count} output format
     *       (default {@code text}); {@code -count} always prints to standard output</li>
     *   <li>{@code -line} / {@code -char} report positions as lines or characters</li>
     *   <li>{@code -xml} treat the input as markup and use the HTML word reader</li>
     *   <li>{@code -merge} collapse repeated names into one hit with all positions</li>
     *   <li>{@code -encoding NAME} input character encoding</li>
     *   <li>{@code -verbose} enable the extractor's debug flag</li>
     *   <li>{@code -progress} print progress lines while reading</li>
     * </ul>
     * The single remaining argument is a file name, an {@code http://} or
     * {@code https://} URL, a {@code .pdf} file, or {@code -} for standard input.
     *
     * <p>Any usage or input-opening error terminates the JVM with exit status 1. An
     * {@code IOException} while processing is reported and the hits collected so far
     * are still written.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        DocumentExtractor x = null;
        InputStream in = null;
        String outputFormat = "text";
        encoding = null;
        OutputStream outStream = System.out;
        boolean markup = false;
        boolean verbose = false;
        boolean mergeMultipleHits = false;
        ProgressListener listener = null;
        int arg = 0;
        // A lone "-" is not an option: it names standard input and ends option parsing.
        while (arg < args.length && args[arg].startsWith("-") && args[arg].length() > 1) {
            String argument = args[arg++];
            if (argument.equals("-o")) {
                String outputFile = DocumentExtractor.optionValue(args, arg++, argument);
                try {
                    outStream = new FileOutputStream(outputFile);
                } catch (IOException e) {
                    System.err.println(e);
                    System.exit(1);
                }
                if (outputFile.endsWith(".smi") || outputFile.endsWith(".cxsmi")) {
                    outputFormat = "smiles";
                } else if (outputFile.endsWith(".mrv")) {
                    outputFormat = "mrv";
                }
            } else if (argument.equals("-line")) {
                countLines = true;
            } else if (argument.equals("-char")) {
                countLines = false;
            } else if (argument.equals("-smiles")) {
                outputFormat = "smiles";
            } else if (argument.equals("-mrv")) {
                outputFormat = "mrv";
            } else if (argument.equals("-text")) {
                outputFormat = "text";
            } else if (argument.equals("-count")) {
                outputFormat = "count";
            } else if (argument.equals("-xml")) {
                markup = true;
            } else if (argument.equals("-merge")) {
                mergeMultipleHits = true;
            } else if (argument.equals("-encoding")) {
                encoding = DocumentExtractor.optionValue(args, arg++, argument);
            } else if (argument.equals("-verbose")) {
                verbose = true;
            } else if (argument.equals("-progress")) {
                listener = new ProgressListener(){

                    @Override
                    public boolean progress(ProgressInfo info) {
                        System.out.println("PROGRESS: " + info.getCharactersRead() + "/" + info.getEstimatedTotalCharacters());
                        // false means "keep going"; returning true would abort processing.
                        return false;
                    }
                };
            } else {
                System.err.println("Unknown argument: " + argument);
                System.exit(1);
            }
        }
        if (encoding == null && verbose) {
            System.out.println("Using system default encoding: " + Charset.defaultCharset().displayName());
        }
        if (arg == args.length - 1) {
            String inFileName = args[arg];
            if (inFileName.equals("-")) {
                in = System.in;
            } else if (inFileName.startsWith("http://") || inFileName.startsWith("https://")) {
                try {
                    x = new DocumentExtractor(new URL(inFileName));
                } catch (IOException e) {
                    System.err.println(e.getMessage());
                    System.exit(1);
                }
            } else if (inFileName.endsWith(".pdf")) {
                File pdf = new File(inFileName);
                try {
                    x = DocumentExtractor.readPDF(pdf);
                } catch (IOException e) {
                    System.err.println("Error while reading PDF file: " + e);
                    System.exit(1);
                }
            } else {
                try {
                    x = new DocumentExtractor(new File(inFileName), encoding);
                } catch (UnsupportedEncodingException e) {
                    System.err.println("The specified encoding is not supported: " + encoding);
                    System.exit(1);
                } catch (IOException e) {
                    System.err.println("Error reading file: " + inFileName);
                    // Fail with a non-zero status like every other input error.
                    System.exit(1);
                }
            }
        } else {
            DocumentExtractor.usage();
            System.exit(1);
        }
        if (x == null) {
            // Only the standard-input case reaches here without an extractor.
            InputStreamReader r = null;
            try {
                r = encoding == null ? new InputStreamReader(in) : new InputStreamReader(in, encoding);
            } catch (UnsupportedEncodingException e) {
                System.err.println("The specified encoding is not supported: " + encoding);
                System.exit(1);
            }
            x = new DocumentExtractor(r);
        }
        x.mergeMultipleHits = mergeMultipleHits;
        x.ex.dbg = verbose;
        try {
            if (markup) {
                x.processHTML(listener);
            } else {
                x.processPlainText(listener);
            }
        } catch (IOException e) {
            // Best effort: report and still output whatever was found before the failure.
            System.err.println("Error reading input: " + e);
        }
        if (outputFormat.equals("count")) {
            System.out.println(x.getHits().size());
            return;
        }
        if (outputFormat.equals("mrv")) {
            try {
                MolExporter exp = new MolExporter(outStream, "mrv");
                try {
                    for (Hit hit : x.getHits()) {
                        hit.structure.setProperty("position", hit.getPositionsString());
                        exp.write(hit.structure);
                    }
                } finally {
                    // Close even when a write fails so buffered output is not lost silently.
                    exp.close();
                }
            } catch (IOException e) {
                System.err.println(e);
            }
            return;
        }
        PrintWriter out = new PrintWriter(new OutputStreamWriter(outStream));
        try {
            if (outputFormat.equals("text")) {
                for (Hit hit : x.getHits()) {
                    try {
                        out.println(hit.position + ": " + hit.text + ": " + hit.structure.exportToFormat("cxsmiles"));
                    } catch (MolExportException e) {
                        System.err.println("Error exporting structure for '" + hit.text + "': " + e);
                    }
                }
            } else {
                out.println("#SMILES\tName\t" + (countLines ? "Line" : "Character"));
                for (Hit hit : x.getHits()) {
                    try {
                        out.println(hit.structure.exportToFormat("cxsmiles") + "\t" + hit.text + "\t" + hit.getPositionsString());
                    } catch (MolExportException e) {
                        System.err.println("Error exporting structure for '" + hit.text + "': " + e);
                    }
                }
            }
        } finally {
            out.close();
        }
    }

    /**
     * Returns the value following a command-line option, or prints a diagnostic plus
     * the usage line and exits with status 1 when the option is the last argument.
     */
    private static String optionValue(String[] args, int index, String option) {
        if (index >= args.length) {
            System.err.println("Missing value for option: " + option);
            DocumentExtractor.usage();
            System.exit(1);
        }
        return args[index];
    }

    /**
     * Prints a hint to standard error that the input contains characters invalid in
     * the encoding in use. The wording depends on whether an encoding was set on the
     * command line (static {@code encoding} non-null) or the system default applies.
     */
    public static void printEncodingError() {
        final String message;
        if (encoding != null) {
            message = "The data contains characters that are invalid in the specified encoding (" + encoding + ").\n" + "You should specify the file's actual character encoding or fix the input.";
        } else {
            message = "The data contains characters that are invalid in the system encoding (" + Charset.defaultCharset().displayName() + ").\n" + "You should specify the file's character encoding using \"-encoding ENCODING_NAME\" or fix the input.";
        }
        System.err.println(message);
    }

    /**
     * Replaces the current character source with {@code r} (wrapped in a
     * {@link BufferedReader}) and processes it as plain text without progress reporting.
     *
     * @param r the character source
     * @throws IOException if reading fails
     */
    public void processPlainText(Reader r) throws IOException {
        this.reader = new BufferedReader(r);
        this.processPlainText();
    }

    /**
     * Processes the current source as plain text without progress reporting.
     *
     * @throws IOException if reading fails
     */
    public void processPlainText() throws IOException {
        // The cast selects the ProgressListener overload; a bare null would be ambiguous with the Reader one.
        this.processPlainText((ProgressListener)null);
    }

    /**
     * Processes the current source as plain text, reading words through a
     * {@link WordReader}.
     *
     * @param progressListener receives progress callbacks after each word, or {@code null} for none
     * @throws IOException if reading fails
     * @throws Error if no document source has been set
     */
    public void processPlainText(ProgressListener progressListener) throws IOException {
        final BufferedReader source = this.reader;
        if (source != null) {
            this.process(new WordReader(source), progressListener);
            return;
        }
        throw new Error("No document to read from. Use the appropriate constructor for the DocumentExtractor class");
    }

    /**
     * Replaces the current character source with {@code r} (wrapped in a
     * {@link BufferedReader}) and processes it as HTML without progress reporting.
     *
     * @param r the character source
     * @throws IOException if reading fails
     */
    public void processHTML(Reader r) throws IOException {
        this.reader = new BufferedReader(r);
        this.processHTML();
    }

    /**
     * Processes the current source as HTML without progress reporting.
     *
     * @throws IOException if reading fails
     */
    public void processHTML() throws IOException {
        // The cast selects the ProgressListener overload; a bare null would be ambiguous with the Reader one.
        this.processHTML((ProgressListener)null);
    }

    /**
     * Processes the current source as HTML, reading words through an
     * {@link HTMLWordReader}.
     *
     * @param progressListener receives progress callbacks after each word, or {@code null} for none
     * @throws IOException if reading fails
     * @throws Error if no document source has been set
     */
    public void processHTML(ProgressListener progressListener) throws IOException {
        final BufferedReader source = this.reader;
        if (source != null) {
            this.process(new HTMLWordReader(source), progressListener);
            return;
        }
        throw new Error("No document to read from. Use the appropriate constructor for the DocumentExtractor class");
    }

    /**
     * Core loop: checks the license, then feeds every word from {@code in} to the
     * extraction engine, flushing hits at sentence and page boundaries.
     *
     * <ul>
     *   <li>The sentence marker (compared by identity with
     *       {@code WordReader.newSentenceMarker}) flushes pending hits and is not
     *       itself passed to the engine.</li>
     *   <li>A well-formed page marker ({@code "<<PAGE" + number + two trailing chars})
     *       flushes pending hits and updates {@code pageNum}; the marker word is then
     *       still passed to the engine, as before. A word that merely starts with
     *       {@code <<PAGE} but carries no parsable number is treated as an ordinary
     *       word instead of aborting the run with a parse exception.</li>
     *   <li>After each word the listener, if any, is called with the same reused
     *       {@link ProgressInfo}; a {@code true} return value stops processing.</li>
     * </ul>
     *
     * @param in the word source (parameter shadows the {@code this.in} stream field)
     * @param progressListener progress callback, or {@code null}
     * @throws IOException if reading fails
     */
    private void process(WordReader in, ProgressListener progressListener) throws IOException {
        LicenseHandler.getInstance().checkLicense("Document to Structure");
        ProgressInfo info = progressListener == null ? null : new ProgressInfo();
        long lastPos = 0L;
        long estimatedTotalCharactersCandidate = this.in == null ? -1L : this.in.getSize();
        String word;
        while ((word = in.readWord()) != null) {
            if (word == WordReader.newSentenceMarker) {
                this.getHits();
                continue;
            }
            if (word.startsWith("<<PAGE")) {
                Integer page = DocumentExtractor.parsePageMarker(word);
                if (page != null) {
                    this.getHits();
                    this.pageNum = page;
                }
            }
            this.ex.putWord(word, countLines ? in.getLine() : in.getWordStart());
            if (progressListener == null) {
                continue;
            }
            ++info.wordsRead;
            info.charactersRead = in.getCharacterPosition();
            info.hitsFound = this.ex.hits.size();
            if (this.in != null) {
                long position = this.in.getPosition();
                // Publish the previous candidate only when the byte position has moved,
                // i.e. when a new chunk was pulled from the underlying stream.
                if (position != lastPos) {
                    lastPos = position;
                    info.estimatedTotalCharacters = estimatedTotalCharactersCandidate;
                }
                // Scale characters read by total bytes / bytes read; skip while nothing
                // has been read from the monitored stream to avoid dividing by zero.
                if (position > 0L) {
                    estimatedTotalCharactersCandidate = this.in.getSize() * (long)info.charactersRead / position;
                }
            }
            if (progressListener.progress(info)) {
                break;
            }
        }
    }

    /**
     * Extracts the page number from a word starting with {@code <<PAGE}: the text
     * between that prefix and the last two characters of the word.
     *
     * @return the page number, or {@code null} if the word holds no parsable integer there
     */
    private static Integer parsePageMarker(String word) {
        int start = "<<PAGE".length();
        int end = word.length() - 2;
        if (end <= start) {
            return null;
        }
        try {
            return Integer.valueOf(Integer.parseInt(word.substring(start, end)));
        } catch (NumberFormatException notAMarker) {
            return null;
        }
    }

    /**
     * Forces the engine to process everything buffered, converts each engine hit into
     * a {@link Hit} (merging if enabled), clears the engine's hit list and returns all
     * hits accumulated so far.
     *
     * <p>The reported position is the engine hit's value, advanced by the offset of the
     * name inside the hit's text when the name starts after the first character.
     *
     * @return the live internal list of hits (not a copy)
     */
    public List<Hit> getHits() {
        this.ex.processAll();
        for (Extractor.Token token : this.ex.hits) {
            Extractor.Hit found = (Extractor.Hit)token;
            int base = (Integer)found.value;
            int nameOffset = found.text.indexOf(found.name);
            // A negative (not found) or zero offset leaves the base position unchanged.
            this.addHit(found.name, found.molecule, base + Math.max(nameOffset, 0));
        }
        this.ex.hits.clear();
        return this.hits;
    }

    /**
     * Records one hit. Without merging every occurrence becomes its own {@link Hit}.
     * With merging, an occurrence whose name equals an existing hit's text only adds
     * its position to that hit; otherwise a new hit is created with a position list
     * seeded with this position.
     *
     * @param name the recognised name
     * @param structure the structure for the name
     * @param position position of the occurrence
     */
    private void addHit(String name, Molecule structure, int position) {
        if (this.mergeMultipleHits) {
            for (Hit known : this.hits) {
                if (known.text.equals(name)) {
                    known.allPositions.add(position);
                    return;
                }
            }
            Hit merged = new Hit(name, structure, position);
            merged.allPositions = new IntVector();
            merged.allPositions.add(position);
            this.hits.add(merged);
            return;
        }
        this.hits.add(new Hit(name, structure, position));
    }

    /**
     * Discards all hits accumulated so far. Because {@link #getHits()} returns the
     * same list instance, lists obtained earlier are emptied too.
     */
    public void clearHits() {
        this.hits.clear();
    }

    /**
     * Creates an extractor for a PDF file by opening a buffered stream on it and
     * delegating to {@code PDFBridge.readPDF}.
     *
     * @param pdf the PDF file
     * @return the extractor produced by {@code PDFBridge}
     * @throws IOException if the file cannot be opened or read
     */
    public static DocumentExtractor readPDF(File pdf) throws IOException {
        // NOTE(review): the stream opened here is never closed in this method; presumably
        // PDFBridge takes ownership and closes it -- confirm, otherwise this leaks a file handle.
        return PDFBridge.readPDF(pdf, new BufferedInputStream(new FileInputStream(pdf)));
    }

    /**
     * Creates an extractor for PDF data read from a stream, delegating to
     * {@code PDFBridge.readPDF} with no associated file ({@code null}).
     *
     * @param pdfStream stream positioned at the start of the PDF data
     * @return the extractor produced by {@code PDFBridge}
     * @throws IOException if reading fails
     */
    public static DocumentExtractor readPDF(InputStream pdfStream) throws IOException {
        return PDFBridge.readPDF(null, pdfStream);
    }

    /**
     * Callback invoked by the extractor after each word that is passed to the engine.
     */
    public static interface ProgressListener {
        /**
         * Reports progress. The same {@link ProgressInfo} instance is reused and
         * updated between calls, so copy any values that must be kept.
         *
         * @param var1 current progress counters
         * @return {@code true} to stop processing, {@code false} to continue
         */
        public boolean progress(ProgressInfo var1);
    }

    /**
     * Mutable progress counters handed to {@link ProgressListener#progress}. The
     * fields are package-private and written by the enclosing extractor.
     */
    public static class ProgressInfo
    implements Cloneable {
        int charactersRead;
        int wordsRead;
        int hitsFound;
        // -1 means "no estimate available".
        long estimatedTotalCharacters = -1L;

        /** @return the number of characters read so far */
        public int getCharactersRead() {
            return this.charactersRead;
        }

        /** @return the number of words read so far */
        public int getWordsRead() {
            return this.wordsRead;
        }

        /** @return the number of hits found so far */
        public int getHitsFound() {
            return this.hitsFound;
        }

        /**
         * @return -1 when no estimate is available; otherwise the estimate, never
         *         smaller than the number of characters already read
         */
        public long getEstimatedTotalCharacters() {
            final long estimate = this.estimatedTotalCharacters;
            return estimate == -1L ? -1L : Math.max(estimate, (long)this.charactersRead);
        }
    }

    /**
     * One recognised name together with its structure and where it was found. This is
     * an inner class: each hit captures the enclosing extractor's current page number
     * when it is created.
     */
    public class Hit {
        public final String text;
        public final Molecule structure;
        public final int position;
        // Only set by the enclosing class in merge mode; null otherwise.
        private IntVector allPositions;
        int pageNumber;

        Hit(String text, Molecule structure, int position) {
            this.pageNumber = DocumentExtractor.this.pageNum;
            this.position = position;
            this.structure = structure;
            this.text = text;
        }

        /**
         * @return every recorded position as an array, or {@code null} when no
         *         position list has been set on this hit
         */
        public final int[] getAllPositions() {
            return this.allPositions == null ? null : this.allPositions.toArray();
        }

        /**
         * @return the position list's string form when a list is set, otherwise the
         *         single position as a decimal string
         */
        public String getPositionsString() {
            return this.allPositions == null
                ? Integer.toString(this.position)
                : this.allPositions.toString();
        }

        /** @return {@code position:text:smiles}, the last part from {@code Util.getDictionarySmiles} */
        @Override
        public String toString() {
            return this.position + ":" + this.text + ":" + Util.getDictionarySmiles(this.structure);
        }

        /** @return the enclosing extractor's page number at the time this hit was created */
        public int getPageNumber() {
            return this.pageNumber;
        }
    }
}

