/*
 * Decompiled with CFR 0.152.
 */
package chemaxon.naming.document;

import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.marvin.io.PositionedInputStream;
import chemaxon.naming.document.D2S;
import chemaxon.naming.document.ExtendedLocator;
import chemaxon.naming.document.Extractor;
import chemaxon.naming.document.HTMLWordReader;
import chemaxon.naming.n2s.Util;
import chemaxon.struc.Molecule;
import chemaxon.util.IO;
import chemaxon.util.LoggingUtil;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.concurrent.BlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

/**
 * Runs a Tika parse over one document and publishes every structure found as a
 * {@link Molecule} on a {@link BlockingQueue}.
 *
 * <p>The class acts as the SAX {@link ContentHandler} of the parse: character data is
 * handed to an {@link Extractor}, and the hits the extractor reports are pushed to the
 * queue. Embedded OLE objects (Symyx, Marvin and ChemDraw storages) are handled by a
 * custom {@link EmbeddedDocumentExtractor} registered in the {@link ParseContext}.
 *
 * <p>{@link #run()} always finishes by putting {@link #endOfDocument} on the queue, so a
 * consumer can use that instance as an end-of-stream marker.
 */
public class TikaParser
implements Runnable,
ContentHandler {
    private static final Logger logger = LoggingUtil.getLogger(TikaParser.class);
    /** Sentinel put on the hits queue by {@link #run()} once parsing has ended. */
    public static final Molecule endOfDocument = new Molecule();
    final BlockingQueue<Molecule> hits;
    private final InputStream document;
    private final Metadata metadata;
    private final ParseContext context = new ParseContext();
    private final Extractor extractor;
    private final D2S.Options options;
    /** When true, {@link #characters(char[], int, int)} discards its input. */
    private boolean ignoreText = false;
    /** Number of {@code <div class="page">} elements seen so far; 0 means none. */
    int page = 0;
    private Locator locator;
    private ExtendedLocator extendedLocator;
    private static final Tika tika = TikaParser.getTika();
    /** True while inside an {@code <sdf>} element. */
    private boolean inSDF = false;
    /** Running character position used for {@code text/plain} content. */
    private int charPosition = 1;
    // Storage class IDs used by parseDirectory to pick the embedded-object decoder.
    private static final ClassID symyxID = new ClassID(new byte[]{-9, -81, 4, 0, 0, 0, 0, 0, -64, 0, 0, 0, 0, 0, 0, 70}, 0);
    private static final ClassID mrvID = new ClassID(new byte[]{9, 67, -60, -23, -38, 55, -110, 72, -115, 85, 52, -107, 104, -31, 32, -118}, 0);
    private static final ClassID chemdrawID = new ClassID(new byte[]{33, 109, -70, 65, 46, -96, -50, 17, -113, -39, 0, 32, -81, -47, -14, 12}, 0);
    /**
     * Extractor for embedded documents. It never forwards content to the SAX handler;
     * it only tries to decode the embedded stream as a chemical structure and, on
     * success, puts the molecule on the hits queue.
     */
    private final EmbeddedDocumentExtractor embeddedDocumentExtractor = new EmbeddedDocumentExtractor(){

        /**
         * Decodes an embedded stream. Only {@link TikaInputStream}s are processed;
         * I/O failures are logged and do not abort the enclosing parse.
         */
        @Override
        public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Parsing embedded document: " + metadata);
            }
            if (stream instanceof TikaInputStream) {
                try {
                    this.parseTikaInputStream((TikaInputStream)stream, metadata);
                }
                catch (IOException e) {
                    logger.log(Level.WARNING, "Error while parsing embedded document", e);
                }
            } else if (logger.isLoggable(Level.FINE)) {
                logger.fine("Unknown document: " + metadata.get("resourceName"));
            }
        }

        /**
         * Dispatches on the kind of embedded stream: a POIFS container is parsed as a
         * directory, anything else is read fully and treated either as a Symyx payload
         * (recognized by bytes 4..7) or as raw structure data.
         */
        private void parseTikaInputStream(TikaInputStream tin, Metadata metadata) throws IOException {
            Object container = tin.getOpenContainer();
            if (container == null && POIFSFileSystem.hasPOIFSHeader((InputStream)tin)) {
                this.parseDirectory(new POIFSFileSystem((InputStream)tin).getRoot());
            } else if (container instanceof DirectoryNode) {
                this.parseDirectory((DirectoryNode)container);
            } else {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("Non DirectoryNode container: " + container);
                }
                byte[] data = IO.readBytes((InputStream)tin);
                // The signature check needs 8 bytes; shorter payloads go to the raw importer.
                boolean symyxSignature = data.length >= 8 && data[4] == 1 && data[5] == 3 && data[6] == 0 && data[7] == 4;
                if (symyxSignature) {
                    this.parseSymyx(data);
                } else {
                    this.parseRaw(data);
                }
            }
        }

        /** Lets {@link MolImporter} guess the format; unrecognized data is ignored. */
        private void parseRaw(byte[] data) throws IOException {
            ByteArrayInputStream in = new ByteArrayInputStream(data);
            try {
                MolImporter imp = new MolImporter(in, null);
                Molecule m = imp.read();
                if (m != null) {
                    m.setProperty("type", imp.getFormat());
                    m.setAbsStereo(true);
                    this.addHit(m);
                }
            }
            catch (MolFormatException e) {
                logger.fine("Not a recognized format");
            }
        }

        /** Selects the decoder matching the storage class ID of {@code dir}. */
        private void parseDirectory(DirectoryNode dir) throws IOException {
            ClassID clsid = dir.getStorageClsid();
            if (clsid.equals(symyxID)) {
                this.parseSymyx(dir);
            } else if (clsid.equals(mrvID)) {
                this.parseMrv(dir);
            } else if (clsid.equals(chemdrawID)) {
                this.parseChemdraw(dir);
            } else if (logger.isLoggable(Level.FINE)) {
                logger.fine("Unknown document class: " + dir.getStorageClsid());
            }
        }

        /** Reads the {@code Contents} entry of a Symyx storage and decodes it. */
        private void parseSymyx(DirectoryNode dir) throws IOException {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found embedded document: Symyx (" + dir.getName() + ")");
            }
            DocumentEntry entry = (DocumentEntry)dir.getEntry("Contents");
            byte[] data = this.read(entry);
            this.parseSymyx(data);
        }

        /**
         * Decodes a Symyx payload: bytes 0..1 hold a little-endian length and the
         * structure data starts at offset 4. Truncated payloads are logged and skipped.
         */
        private void parseSymyx(byte[] data) throws IOException {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Read " + data.length + " bytes for Symyx");
            }
            if (data.length < 4) {
                logger.warning("Symyx data too short: " + data.length + " bytes");
                return;
            }
            int len0 = data[0] & 0xFF;
            int len1 = data[1] & 0xFF;
            int len = len1 * 256 + len0;
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Length= " + len);
            }
            if (len > data.length - 4) {
                logger.warning("Symyx data truncated: declared length " + len + ", available " + (data.length - 4));
                return;
            }
            byte[] mol = new byte[len];
            System.arraycopy(data, 4, mol, 0, len);
            Molecule m = MolImporter.importMol(mol);
            if (m != null) {
                m.setProperty("type", "symyx");
                this.addHit(m);
            } else if (logger.isLoggable(Level.WARNING)) {
                logger.warning("Could not import structure: " + new String(mol));
            }
        }

        /**
         * Decodes a Marvin storage. The {@code Contents} entry carries a little-endian
         * 16-bit character count at offset 4; the value 65535 means a 32-bit count
         * follows at offset 6. The text itself is stored two bytes per character and
         * only the first byte of each pair is kept. Truncated payloads are logged and
         * skipped.
         */
        private void parseMrv(DirectoryNode dir) throws IOException {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found embedded document: MRV (" + dir.getName() + ")");
            }
            DocumentEntry entry = (DocumentEntry)dir.getEntry("Contents");
            byte[] data = this.read(entry);
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Read " + data.length + " bytes");
            }
            if (data.length < 6) {
                logger.warning("MRV data too short: " + data.length + " bytes");
                return;
            }
            int len0 = data[4] & 0xFF;
            int len1 = data[5] & 0xFF;
            // long: the extended form is an unsigned 32-bit count.
            long len = len1 * 256 + len0;
            int start = 6;
            if (len == 65535) {
                if (data.length < 10) {
                    logger.warning("MRV data too short for extended length: " + data.length + " bytes");
                    return;
                }
                start = 10;
                len0 = data[6] & 0xFF;
                len1 = data[7] & 0xFF;
                int len2 = data[8] & 0xFF;
                int len3 = data[9] & 0xFF;
                len = ((long)len3 << 24) | (long)(len2 * 65536 + len1 * 256 + len0);
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("Length= " + len + " (len0=" + len0 + ",len1=" + len1 + ",len2=" + len2 + ",len3=" + len3 + ")");
                }
            } else if (logger.isLoggable(Level.FINE)) {
                logger.fine("Length= " + len + " (len0=" + len0 + ",len1=" + len1 + ")");
            }
            // The last byte read below is data[start + (len - 1) * 2].
            if (len > 0 && (long)start + (len - 1) * 2 >= data.length) {
                logger.warning("MRV data truncated: declared length " + len + ", available bytes " + (data.length - start));
                return;
            }
            int count = (int)len;
            byte[] mrv = new byte[count];
            for (int i = 0; i < count; ++i) {
                mrv[i] = data[start + i * 2];
            }
            Molecule m = MolImporter.importMol(mrv, "mrv", null);
            if (m == null) {
                logger.warning("Could not import MRV structure");
                return;
            }
            m.setProperty("type", "mrv");
            this.addHit(m);
        }

        /** Puts {@code m} on the hits queue, blocking while the queue is full. */
        private void addHit(Molecule m) {
            try {
                TikaParser.this.hits.put(m);
            }
            catch (InterruptedException e) {
                // Keep the interrupt visible to whoever runs this thread.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        }

        /** Imports the {@code CONTENTS} entry of a ChemDraw storage as CDX. */
        private void parseChemdraw(DirectoryNode dir) throws IOException {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found embedded document: ChemDraw (" + dir.getName() + ")");
            }
            DocumentEntry entry = (DocumentEntry)dir.getEntry("CONTENTS");
            byte[] data = this.read(entry);
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Read " + data.length + " bytes");
            }
            Molecule m = MolImporter.importMol(data, "cdx", null);
            if (m == null) {
                logger.warning("Could not import ChemDraw structure");
                return;
            }
            m.setProperty("type", "cdx");
            m.setAbsStereo(true);
            this.addHit(m);
        }

        /** Reads the whole entry into memory and closes the entry stream. */
        private byte[] read(DocumentEntry entry) throws IOException {
            DocumentInputStream in = new DocumentInputStream(entry);
            try {
                return IO.readBytes((InputStream)in);
            }
            finally {
                in.close();
            }
        }

        /** Debugging aid: writes {@code data} to {@code /tmp/<name>}. Not called by this class. */
        private void debugDump(byte[] data, String name) throws IOException {
            File f = new File("/tmp/" + name);
            OutputStream w = new FileOutputStream(f);
            try {
                w.write(data);
            }
            finally {
                w.close();
            }
        }

        /** Accepts every embedded document. */
        @Override
        public boolean shouldParseEmbedded(Metadata metadata) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Embedded document: " + metadata);
            }
            return true;
        }
    };

    /**
     * Creates a parser for one document.
     *
     * @param document stream to parse
     * @param metadata Tika metadata passed to the parser; its {@code Content-Type} is
     *                 consulted when text chunks are positioned
     * @param hits     queue receiving the molecules found, then {@link #endOfDocument}
     * @param options  conversion options; when {@code getInsideTag()} is non-null,
     *                 text is ignored until an element with that name is opened
     */
    public TikaParser(InputStream document, Metadata metadata, BlockingQueue<Molecule> hits, D2S.Options options) {
        this.document = document;
        this.metadata = metadata;
        this.hits = hits;
        this.options = options;
        this.extractor = new Extractor(0, false, true, options);
        if (options.getInsideTag() != null) {
            this.ignoreText = true;
        }
        this.context.set(EmbeddedDocumentExtractor.class, this.embeddedDocumentExtractor);
        this.context.set(D2S.Options.class, options);
    }

    /**
     * Builds the shared {@link Tika} facade from the bundled
     * {@code d2s-tika-config.xml}; falls back to the default configuration (with a
     * warning) when the custom one cannot be loaded.
     */
    static Tika getTika() {
        try {
            return new Tika(new TikaConfig(TikaParser.class.getResource("/chemaxon/naming/document/d2s-tika-config.xml")));
        }
        catch (Exception e) {
            logger.log(Level.WARNING, "Failed to load custom tika config", e);
            return new Tika();
        }
    }

    /**
     * Parses the document and then puts {@link #endOfDocument} on the hits queue,
     * whether or not parsing succeeded.
     *
     * @throws RuntimeException if the thread is interrupted while delivering the sentinel
     */
    @Override
    public void run() {
        try {
            this.parse();
        }
        finally {
            // A pending interrupt would make put() fail immediately and leave the
            // consumer without its sentinel; clear it for the put and restore it after.
            boolean interrupted = Thread.interrupted();
            try {
                this.hits.put(endOfDocument);
            }
            catch (InterruptedException e) {
                interrupted = true;
                throw new RuntimeException(e);
            }
            finally {
                if (interrupted) {
                    Thread.currentThread().interrupt();
                }
            }
        }
    }

    /** Runs the Tika parse with this object as content handler; failures are logged. */
    private void parse() {
        try {
            tika.getParser().parse(this.document, this, this.metadata, this.context);
        }
        catch (IOException e) {
            TikaParser.logParseFailure(e);
        }
        catch (SAXException e) {
            TikaParser.logParseFailure(e);
        }
        catch (TikaException e) {
            TikaParser.logParseFailure(e);
        }
    }

    /** Reports a failed parse through the class logger. */
    private static void logParseFailure(Exception e) {
        logger.log(Level.WARNING, "Error while parsing document", e);
    }

    /** Publishes every hit currently held by the extractor, then clears them. */
    private void pushHits() {
        for (Extractor.Token t : this.extractor.hits) {
            Extractor.Hit hit = (Extractor.Hit)t;
            this.pushHit(this.createHit(hit));
        }
        this.extractor.hits.clear();
    }

    /**
     * Tags {@code hit} with the current page and byte position (when positive) and
     * puts it on the hits queue, blocking while the queue is full.
     */
    private void pushHit(Molecule hit) {
        if (this.page > 0) {
            hit.setPropertyObject("page", this.page);
        }
        long bytePos = this.getBytePos(hit);
        if (bytePos > 0L) {
            hit.setPropertyObject("byte", bytePos);
        }
        try {
            this.hits.put(hit);
        }
        catch (InterruptedException e) {
            // Keep the interrupt visible to whoever runs this thread.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the position to report for {@code hit}: the hit's {@code character}
     * property plus the length of its {@code text} property when the former is set,
     * otherwise the current file pointer of a {@link PositionedInputStream} document,
     * otherwise 0.
     */
    private long getBytePos(Molecule hit) {
        Integer charPos = (Integer)hit.properties().getObject("character");
        if (charPos != null) {
            return charPos.longValue() + (long)Util.getProperty(hit, "text").length();
        }
        if (this.document instanceof PositionedInputStream) {
            return ((PositionedInputStream)this.document).getFilePointer();
        }
        return 0L;
    }

    @Override
    public void setDocumentLocator(Locator locator) {
        logger.finest("Document has Locator");
        this.locator = locator;
        this.extendedLocator = locator instanceof ExtendedLocator ? (ExtendedLocator)locator : this.getXercesLocator(locator);
    }

    /** Always returns {@code null}: no adapter for other locator types exists here. */
    private ExtendedLocator getXercesLocator(Locator locator) {
        return null;
    }

    /** Logs that character locations are unavailable and returns {@code null}. */
    private ExtendedLocator failedLocator(Exception e) {
        logger.log(Level.WARNING, "Failed to get Locator, character locations will not be reported", e);
        return null;
    }

    @Override
    public void startDocument() throws SAXException {
        logger.fine("Starting processing document as " + this.metadata.get("Content-Type"));
    }

    /** Flushes the extractor, publishes the remaining hits and stops accepting text. */
    @Override
    public void endDocument() throws SAXException {
        this.extractor.processAll();
        this.pushHits();
        this.ignoreText = true;
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /**
     * Flushes pending text at sentence-breaking elements and at {@code <sdf>}, counts
     * {@code <div class="page">} elements, tracks {@code <sdf>} scope, and re-enables
     * text when the configured inside-tag is opened.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
        boolean sdf = "sdf".equals(qName);
        if (HTMLWordReader.breaksSentences(qName) || sdf) {
            this.extractor.processAll();
            this.pushHits();
        }
        if ("div".equals(qName) && "page".equals(atts.getValue("class"))) {
            ++this.page;
        }
        if (sdf) {
            this.inSDF = true;
        }
        if (this.ignoreText) {
            // ignoreText is also set by endDocument(), so the inside-tag may be null here.
            String insideTag = this.options.getInsideTag();
            if (insideTag != null && insideTag.equals(qName)) {
                this.ignoreText = false;
            }
        }
    }

    /** Leaves {@code <sdf>} scope and disables text again when the inside-tag closes. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if ("sdf".equals(qName)) {
            this.inSDF = false;
        }
        if (qName.equals(this.options.getInsideTag())) {
            this.ignoreText = true;
        }
    }

    /**
     * Routes character data: ignored while {@code ignoreText} is set, imported as SDF
     * inside an {@code <sdf>} element, otherwise fed to the extractor.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (this.ignoreText) {
            return;
        }
        String text = new String(ch, start, length);
        if (this.inSDF) {
            this.processSDF(text);
            return;
        }
        this.pushText(text);
        // NOTE(review): the meaning of the argument 10 is defined by Extractor.process — confirm there.
        this.extractor.process(10);
        this.pushHits();
    }

    /** Imports every record of an SDF text block and publishes each as a hit. */
    private void processSDF(String sdf) {
        ByteArrayInputStream in = new ByteArrayInputStream(sdf.getBytes(IO.UTF8));
        try {
            Molecule m;
            MolImporter imp = new MolImporter(in, "sdf", IO.UTF8.displayName());
            while ((m = imp.read()) != null) {
                // Called for its side effect: moves Confidence_Estimate to "confidence".
                this.getConfidence(m);
                this.pushHit(m);
            }
        }
        catch (IOException e) {
            logger.log(Level.WARNING, "Error while importing data after OSR", e);
        }
    }

    /**
     * Moves the {@code Confidence_Estimate} property of {@code m} to {@code confidence}
     * and returns its numeric value.
     *
     * @return the parsed value, or 0.0 when the property is absent or not a number
     */
    private double getConfidence(Molecule m) {
        String confidenceStr = m.getProperty("Confidence_Estimate");
        if (confidenceStr == null) {
            return 0.0;
        }
        m.setProperty("confidence", confidenceStr);
        m.properties().setObject("Confidence_Estimate", null);
        try {
            return Double.parseDouble(confidenceStr);
        }
        catch (NumberFormatException e) {
            return 0.0;
        }
    }

    /**
     * Hands a text chunk to the extractor together with the best available position:
     * a running character count for {@code text/plain}, the extended locator's
     * character offset, the column number when the locator reports line 0, or no
     * position at all.
     */
    private void pushText(String text) {
        if (logger.isLoggable(Level.FINER)) {
            logger.finer("Got text: '" + text + "' at location " + this.display(this.locator));
        }
        if ("text/plain".equals(this.metadata.get("Content-Type"))) {
            this.extractor.putChunk(text, this.charPosition);
            this.charPosition += text.length();
            return;
        }
        if (this.locator != null) {
            if (this.extendedLocator != null) {
                this.extractor.putChunk(text, this.extendedLocator.getCharacterOffset() - text.length());
                return;
            }
            if (this.locator.getLineNumber() == 0 && this.locator.getColumnNumber() > 0) {
                this.extractor.putChunk(text, this.locator.getColumnNumber() - text.length());
                return;
            }
        }
        this.extractor.putChunk(text);
    }

    /** Formats a locator for log output. */
    private String display(Locator locator) {
        if (locator == null) {
            return "<NULL>";
        }
        if (this.extendedLocator != null) {
            return "<char=" + this.extendedLocator.getCharacterOffset() + ", line=" + locator.getLineNumber() + ", col=" + locator.getColumnNumber() + ">";
        }
        return "<line=" + locator.getLineNumber() + ", col=" + locator.getColumnNumber() + ">";
    }

    /**
     * Turns an extractor hit into the molecule to publish: stores the matched name as
     * {@code text} and, when the hit carries a position, the (positive) character
     * position of the name as {@code character}.
     */
    private Molecule createHit(Extractor.Hit hit) {
        Molecule res = hit.molecule;
        res.setProperty("text", hit.name);
        if (hit.value != null) {
            int position = (Integer)hit.value;
            int index = hit.text.indexOf(hit.name);
            if (index > 0) {
                position += index;
            }
            if (position > 0) {
                res.setPropertyObject("character", position);
            }
        }
        return res;
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
    }

    @Override
    public void processingInstruction(String target, String data) throws SAXException {
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
    }
}

