java - convert a word documents to HTML with embedded images by TIKA -

April 15, 2010

I'm new to Tika. I'm trying to convert Microsoft Word documents to HTML using Tika. I'm using the TikaOnDotNet wrapper, which lets Tika be used on the .NET Framework. My conversion code is the following:

        // Question code: parses a Word document with Tika and serialises the SAX events as HTML.
        // No embedded-document extractor is registered here, which matches the reported
        // symptom below (img tags keep their "embedded:" source).
        byte[] file = Files.toByteArray(new File(@"mypath\document.doc"));
        AutoDetectParser tikaParser = new AutoDetectParser();

        // Serialise Tika's XHTML SAX events into an in-memory HTML document.
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
        handler.setResult(new StreamResult(output));

        ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);

        tikaParser.parse(new ByteArrayInputStream(file), handler1, new Metadata());

        File ofile = new File(@"c:\tohtml\text.html");
        ofile.createNewFile();
        DataOutputStream stream = new DataOutputStream(new FileOutputStream(ofile));
        try
        {
            output.writeTo(stream);
        }
        finally
        {
            // Close (and thereby flush) the file even if the write fails.
            stream.close();
        }

Everything is working except embedded images. The generated HTML contains an image tag like:

<img src="embedded:image2.wmf" alt="image2.wmf"/>

But the image source does not exist. Please advise me.

Credit goes to @gagravarr.

Please note that this is a simple implementation of the code; the original code is available in the comments on the question.

This implementation is based on the TikaOnDotNet wrapper.

/// <summary>
/// Converts a Word document to HTML with Apache Tika (through the TikaOnDotNet / IKVM
/// bindings). Embedded images are written to an "images" folder next to the HTML file and
/// the "embedded:" image sources in the generated markup are rewritten to point at them.
/// </summary>
public class doctohtml
{
    private const string HtmlOutputPath = @"c:\tohtml\text.html";
    private const string ImagesDirectory = @"c:\tohtml\images";
    private const string EmbeddedPrefix = "embedded:";

    private TikaConfig config = TikaConfig.getDefaultConfig();

    /// <summary>
    /// Reads "filename.doc", parses it with Tika and writes the resulting HTML to
    /// c:\tohtml\text.html. Images found inside the document are saved by
    /// <see cref="FileEmbeddedDocumentExtractor"/> while parsing.
    /// </summary>
    public void convert()
    {
        byte[] file = Files.toByteArray(new File(@"filename.doc"));
        AutoDetectParser tikaParser = new AutoDetectParser();

        ByteArrayOutputStream output = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
        var inputStream = new ByteArrayInputStream(file);

        var metadata = new Metadata();
        EncodingDetector encodingDetector = new UniversalEncodingDetector();
        var detected = encodingDetector.detect(inputStream, metadata);
        // Fall back to UTF-8 when nothing is detected: it is the usual encoding for HTML
        // output, whereas UTF-32 is not accepted by browsers.
        string encodingName = detected != null ? detected.toString() : "UTF-8";

        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encodingName);
        handler.setResult(new StreamResult(output));

        // Rewrites the img sources before the events reach the serialiser.
        ContentHandler imageRewriting = new ImageRewritingContentHandler(handler);

        // Registering the extractor in the parse context makes Tika hand every embedded
        // resource to it instead of discarding the bytes.
        ParseContext context = new ParseContext();
        context.set(typeof(EmbeddedDocumentExtractor), new FileEmbeddedDocumentExtractor());

        tikaParser.parse(inputStream, imageRewriting, new Metadata(), context);

        byte[] array = output.toByteArray();
        System.IO.File.WriteAllBytes(HtmlOutputPath, array);
    }

    /// <summary>
    /// Content-handler decorator that redirects "embedded:" image sources to the images
    /// folder and adds a fixed width attribute to every img element.
    /// </summary>
    private class ImageRewritingContentHandler : ContentHandlerDecorator
    {
        public ImageRewritingContentHandler(ContentHandler handler) : base(handler)
        {
        }

        public override void startElement(string uri, string localName, string name, Attributes origAttrs)
        {
            if (localName != "img")
            {
                base.startElement(uri, localName, name, origAttrs);
                return;
            }

            // Always work on a copy so the caller's attribute object is never modified.
            AttributesImpl attrs = new AttributesImpl(origAttrs);

            for (int i = 0; i < attrs.getLength(); i++)
            {
                if (attrs.getLocalName(i) != "src")
                {
                    continue;
                }

                string src = attrs.getValue(i);
                if (src != null && src.StartsWith(EmbeddedPrefix, System.StringComparison.Ordinal))
                {
                    // Only the leading scheme is replaced; a forward slash keeps the
                    // result a valid relative URL.
                    attrs.setValue(i, "images/" + src.Substring(EmbeddedPrefix.Length));
                }
            }

            // SAX expects an empty string (not null) for "no namespace" and an attribute
            // type such as CDATA.
            attrs.addAttribute("", "width", "width", "CDATA", "100px");
            base.startElement(uri, localName, name, attrs);
        }
    }

    /// <summary>
    /// Embedded-document extractor that saves every embedded image to the images folder.
    /// Non-image resources are ignored.
    /// </summary>
    private class FileEmbeddedDocumentExtractor : EmbeddedDocumentExtractor
    {
        // Used to generate a file name when the embedded resource has none.
        private int count = 0;

        public bool shouldParseEmbedded(Metadata m)
        {
            return true;
        }

        public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, bool outputHtml)
        {
            Detector detector = new DefaultDetector();
            string name = metadata.get("resourceName");
            MediaType contentType = detector.detect(inputStream, metadata);
            if (contentType.getType() != "image")
            {
                return;
            }

            if (string.IsNullOrEmpty(name))
            {
                name = "file" + count++;
            }

            // The name comes from the document, so keep only its last path segment to
            // stay inside the images folder.
            string embeddedFile = new File(name).getName();
            File outputFile = new File(ImagesDirectory, embeddedFile);
            File parent = outputFile.getParentFile();
            if (parent != null && !parent.exists())
            {
                parent.mkdirs();
            }

            using (FileOutputStream os = new FileOutputStream(outputFile))
            {
                var tin = inputStream as TikaInputStream;
                var container = tin != null ? tin.getOpenContainer() as DirectoryEntry : null;
                if (container != null)
                {
                    // OLE2 container: rebuild it entry by entry, then serialise it.
                    POIFSFileSystem fs = new POIFSFileSystem();
                    Copy(container, fs.getRoot());
                    fs.writeFilesystem(os);
                }
                else
                {
                    // Plain stream (TikaInputStream or not): copy the bytes as they are.
                    IOUtils.copy(inputStream, os);
                }
            }
        }

        // Recursively copies every entry of an OLE2 directory into the destination directory.
        private static void Copy(DirectoryEntry source, DirectoryEntry destination)
        {
            var entries = source.getEntries();
            while (entries.hasNext())
            {
                var entry = (Entry)entries.next();
                var directory = entry as DirectoryEntry;
                if (directory != null)
                {
                    Copy(directory, destination.createDirectory(entry.getName()));
                }
                else
                {
                    var contents = new DocumentInputStream((DocumentEntry)entry);
                    try
                    {
                        destination.createDocument(entry.getName(), contents);
                    }
                    finally
                    {
                        contents.close();
                    }
                }
            }
        }
    }
}

Search This Blog

Perl

java - convert a word documents to HTML with embedded images by TIKA -

Comments

Post a Comment

Popular posts from this blog

jOOQ update returning clause with Oracle -

java - Warning equals/hashCode on @Data annotation lombok with inheritance -

java - BasicPathUsageException: Cannot join to attribute of basic type -