java - Convert Word documents to HTML with embedded images using Tika
I'm new to Tika. I am trying to convert Microsoft Word documents to HTML using Tika. I'm using the TikaOnDotNet wrapper so that I can use Tika on the .NET Framework. The conversion code is the following:
// Read the whole Word document into memory so Tika can parse it from a byte stream.
byte[] file = Files.toByteArray(new File(@"mypath\document.doc"));
AutoDetectParser tikaParser = new AutoDetectParser();
ByteArrayOutputStream output = new ByteArrayOutputStream();

// Serialize the SAX events emitted by the parser as indented UTF-8 HTML into `output`.
SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
handler.setResult(new StreamResult(output));
ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);

tikaParser.parse(new ByteArrayInputStream(file), handler1, new Metadata());

// Write the generated HTML to disk. The stream is disposed so the bytes are flushed and the
// file handle is released even if the write throws.
File ofile = new File(@"c:\tohtml\text.html");
ofile.createNewFile();
using (DataOutputStream stream = new DataOutputStream(new FileOutputStream(ofile)))
{
    output.writeTo(stream);
}
Everything is working except for embedded images. The generated HTML contains image tags like:
<img src="embedded:image2.wmf" alt="image2.wmf"/>
But the image source does not exist. Please advise me.
Credit goes to @Gagravarr.
Please note that this is a simple implementation of the code; the original code is available in the comments on the question.
This implementation is based on the TikaOnDotNet wrapper:
/// <summary>
/// Converts a Word document to HTML with Tika (through the TikaOnDotNet wrapper). Embedded images
/// are written to an "images" folder next to the HTML file, and the "embedded:" image sources
/// that Tika emits are rewritten to point at those files.
/// </summary>
public class doctohtml
{
    // NOTE(review): not used by any code shown here; kept in case other code relies on it.
    private TikaConfig config = TikaConfig.getDefaultConfig();

    /// <summary>
    /// Parses <c>filename.doc</c>, writes the resulting HTML to <c>c:\tohtml\text.html</c> and
    /// saves embedded images under <c>c:\tohtml\images</c>.
    /// </summary>
    public void convert()
    {
        byte[] file = Files.toByteArray(new File(@"filename.doc"));
        AutoDetectParser tikaParser = new AutoDetectParser();
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
        var inputStream = new ByteArrayInputStream(file);
        var metadata = new Metadata();

        // Fall back to UTF-8 when no encoding is detected: browsers do not decode UTF-32 HTML.
        EncodingDetector encodingDetector = new UniversalEncodingDetector();
        var encode = encodingDetector.detect(inputStream, metadata)
                     ?? java.nio.charset.Charset.forName("UTF-8");

        // Serialize the parser's SAX events as indented HTML into the in-memory buffer.
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encode.toString());
        handler.setResult(new StreamResult(output));
        ContentHandler imageRewriting = new ImageRewritingContentHandler(handler);

        // Register the extractor that saves embedded images while the document is parsed.
        ParseContext context = new ParseContext();
        context.set(typeof(EmbeddedDocumentExtractor), new FileEmbeddedDocumentExtractor());

        // Reuse the metadata object that was handed to the encoding detector.
        tikaParser.parse(inputStream, imageRewriting, metadata, context);

        byte[] array = output.toByteArray();
        System.IO.File.WriteAllBytes(@"c:\tohtml\text.html", array);
    }

    /// <summary>
    /// Rewrites the <c>src</c> of img elements from Tika's "embedded:" scheme to a path inside
    /// the images folder, and gives each image a default width when it has none.
    /// </summary>
    private class ImageRewritingContentHandler : ContentHandlerDecorator
    {
        private const string EmbeddedPrefix = "embedded:";

        public ImageRewritingContentHandler(ContentHandler handler) : base(handler)
        {
        }

        public override void startElement(string uri, string localName, string name, Attributes origAttrs)
        {
            if (!"img".Equals(localName))
            {
                base.startElement(uri, localName, name, origAttrs);
                return;
            }

            // Attributes is read-only; reuse the instance when it is already mutable, else copy it.
            AttributesImpl attrs = origAttrs as AttributesImpl ?? new AttributesImpl(origAttrs);

            for (int i = 0; i < attrs.getLength(); i++)
            {
                if (!"src".Equals(attrs.getLocalName(i)))
                {
                    continue;
                }

                string src = attrs.getValue(i);
                if (src != null && src.StartsWith(EmbeddedPrefix))
                {
                    // Strip only the leading scheme so the rest of the name is left untouched.
                    attrs.setValue(i, @"images\" + src.Substring(EmbeddedPrefix.Length));
                }
            }

            // SAX expects "" for "no namespace" and "CDATA" as the generic attribute type.
            // Skip the default when a width is already present to avoid a duplicate attribute.
            if (attrs.getIndex("width") < 0)
            {
                attrs.addAttribute("", "width", "width", "CDATA", "100px");
            }

            base.startElement(uri, localName, name, attrs);
        }
    }

    /// <summary>
    /// Saves every embedded resource that is detected as an image into <c>c:\tohtml\images</c>.
    /// Resources of any other media type are ignored.
    /// </summary>
    private class FileEmbeddedDocumentExtractor : EmbeddedDocumentExtractor
    {
        // Counter used to build a file name for resources that do not carry one.
        private int count = 0;

        public bool shouldParseEmbedded(Metadata m)
        {
            return true;
        }

        public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, bool outputHtml)
        {
            Detector detector = new DefaultDetector();
            string name = metadata.get("resourceName");
            MediaType contentType = detector.detect(inputStream, metadata);
            if (contentType.getType() != "image")
            {
                return;
            }

            // The name comes from the document, so keep only its last path component to stay
            // inside the images folder, and invent a name when none is usable.
            string embeddedFile = name == null ? "" : new File(name).getName();
            if (embeddedFile.Length == 0)
            {
                embeddedFile = "file" + count++;
            }

            File outputFile = new File(@"c:\tohtml\images", embeddedFile);
            File parent = outputFile.getParentFile();
            if (!parent.exists() && !parent.mkdirs())
            {
                throw new java.io.IOException("unable to create directory \"" + parent + "\"");
            }

            using (FileOutputStream os = new FileOutputStream(outputFile))
            {
                var tin = inputStream as TikaInputStream;
                DirectoryEntry container = tin == null ? null : tin.getOpenContainer() as DirectoryEntry;
                if (container != null)
                {
                    // The resource is an open OLE2 container: copy its entries into a new
                    // file system before writing, otherwise only an empty container is saved.
                    POIFSFileSystem fs = new POIFSFileSystem();
                    org.apache.poi.poifs.filesystem.EntryUtils.copyNodes(container, fs.getRoot());
                    fs.writeFilesystem(os);
                }
                else
                {
                    // Plain stream (whether or not it is a TikaInputStream): copy the bytes as-is.
                    IOUtils.copy(inputStream, os);
                }
            }
        }
    }
}
Comments
Post a Comment