8 anos atrás · 533360b5e9
--- a/cqb-comm-utils/src/main/java/com/qmth/cqb/utils/ConvertOutHtml.java
+++ b/cqb-comm-utils/src/main/java/com/qmth/cqb/utils/ConvertOutHtml.java
@@ -0,0 +1,147 @@
 
				+package com.qmth.cqb.utils;

			
 
				+

			
 
				+import java.io.ByteArrayOutputStream;

			
 
				+import java.io.FileOutputStream;

			
 
				+import java.io.OutputStream;

			
 
				+import org.docx4j.Docx4J;

			
 
				+import org.docx4j.Docx4jProperties;

			
 
				+import org.docx4j.convert.out.ConversionFeatures;

			
 
				+import org.docx4j.convert.out.HTMLSettings;

			
 
				+import org.docx4j.convert.out.html.SdtToListSdtTagHandler;

			
 
				+import org.docx4j.convert.out.html.SdtWriter;

			
 
				+import org.docx4j.openpackaging.packages.WordprocessingMLPackage;

			
 
				+import org.docx4j.samples.AbstractSample;

			
 
				+

			
 
				+

			
 
				+public class ConvertOutHtml extends AbstractSample  {

			
 
				+	

			
 
				+	

			
 
				+

			
 
				+		// Config for non-command line version

			
 
				+		static {

			
 
				+		

			
 
				+	    	inputfilepath = System.getProperty("user.dir") + "/test/test.docx";

			
 
				+

			
 
				+			save = false;

			
 
				+			nestLists = false;

			
 
				+		}

			
 
				+

			
 
				+		static boolean save;

			
 
				+		static boolean nestLists;

			
 
				+

			
 
				+	    public static void main(String[] args)

			
 
				+	            throws Exception {

			
 
				+	    	

			
 
				+			try {

			
 
				+				getInputFilePath(args);

			
 
				+			} catch (IllegalArgumentException e) {

			
 
				+			}

			
 
				+			

			
 
				+			// Document loading (required)

			
 
				+			WordprocessingMLPackage wordMLPackage;

			
 
				+			if (inputfilepath==null) {

			
 
				+				// Create a docx

			
 
				+				System.out.println("No imput path passed, creating dummy document");

			
 
				+				 wordMLPackage = WordprocessingMLPackage.createPackage();

			
 
				+				//SampleDocument.createContent(wordMLPackage.getMainDocumentPart());	

			
 
				+			} else {

			
 
				+				System.out.println("Loading file from " + inputfilepath);

			
 
				+				wordMLPackage = Docx4J.load(new java.io.File(inputfilepath));

			
 
				+			}

			
 
				+

			
 
				+			// HTML exporter setup (required)

			
 
				+			// .. the HTMLSettings object

			
 
				+	    	HTMLSettings htmlSettings = Docx4J.createHTMLSettings();

			
 
				+

			
 
				+	    	htmlSettings.setImageDirPath(inputfilepath + "_files");

			
 
				+	    	htmlSettings.setImageTargetUri(inputfilepath.substring(inputfilepath.lastIndexOf("/")+1)

			
 
				+	    			+ "_files");

			
 
				+	    	htmlSettings.setWmlPackage(wordMLPackage);

			
 
				+	    	

			
 
				+	    	

			
 
				+	    	/* CSS reset, see http://itumbcom.blogspot.com.au/2013/06/css-reset-how-complex-it-should-be.html 

			
 
				+	    	 * 

			
 
				+	    	 * motivated by vertical space in tables in Firefox and Google Chrome.

			
 
				+	        

			
 
				+		        If you have unwanted vertical space, in Chrome this may be coming from -webkit-margin-before and -webkit-margin-after

			
 
				+		        (in Firefox, margin-top is set to 1em in html.css)

			
 
				+		        

			
 
				+		        Setting margin: 0 on p is enough to fix it.

			
 
				+		        

			
 
				+		        See further http://www.css-101.org/articles/base-styles-sheet-for-webkit-based-browsers/    	

			
 
				+	    	*/

			
 
				+	    	String userCSS = null;

			
 
				+	    	if (nestLists) {

			
 
				+	    		// use browser defaults for ol, ul, li

			
 
				+	    		userCSS = "html, body, div, span, h1, h2, h3, h4, h5, h6, p, a, img,  table, caption, tbody, tfoot, thead, tr, th, td " +

			
 
				+	    			"{ margin: 0; padding: 0; border: 0;}" +

			
 
				+	    			"body {line-height: 1;} ";

			
 
				+	    	} else {

			
 
				+	    		userCSS = "html, body, div, span, h1, h2, h3, h4, h5, h6, p, a, img,  ol, ul, li, table, caption, tbody, tfoot, thead, tr, th, td " +

			
 
				+	        			"{ margin: 0; padding: 0; border: 0;}" +

			
 
				+	        			"body {line-height: 1;} ";

			
 
				+	    		

			
 
				+	    	}

			
 
				+	    	htmlSettings.setUserCSS(userCSS);

			
 
				+	    	

			
 
				+	    	

			
 
				+	    	//Other settings (optional)

			
 
				+//	    	htmlSettings.setUserBodyTop("<H1>TOP!</H1>");

			
 
				+//	    	htmlSettings.setUserBodyTail("<H1>TAIL!</H1>");

			
 
				+			

			
 
				+			// Sample sdt tag handler (tag handlers insert specific

			
 
				+			// html depending on the contents of an sdt's tag).

			
 
				+			// This will only have an effect if the sdt tag contains

			
 
				+			// the string @class=XXX

			
 
				+//				SdtWriter.registerTagHandler("@class", new TagClass() );

			
 
				+			

			
 
				+//			SdtWriter.registerTagHandler(Containerization.TAG_BORDERS, new TagSingleBox() );

			
 
				+//			SdtWriter.registerTagHandler(Containerization.TAG_SHADING, new TagSingleBox() );

			
 
				+	    	

			
 
				+	    	

			
 
				+	    	// list numbering:  depending on whether you want list numbering hardcoded, or done using <li>.

			
 
				+	    	if (nestLists) {

			
 
				+	    		SdtWriter.registerTagHandler("HTML_ELEMENT", new SdtToListSdtTagHandler());

			
 
				+	    	} else {

			
 
				+	    		htmlSettings.getFeatures().remove(ConversionFeatures.PP_HTML_COLLECT_LISTS);

			
 
				+	    	}

			
 
				+			

			
 
				+			// output to an OutputStream.		

			
 
				+			OutputStream os; 

			
 
				+			if (save) {

			
 
				+				os = new FileOutputStream(inputfilepath + ".html");

			
 
				+			} else {

			
 
				+				os = new ByteArrayOutputStream();

			
 
				+				

			
 
				+			}

			
 
				+

			
 
				+			// If you want XHTML output

			
 
				+	    	Docx4jProperties.setProperty("docx4j.Convert.Out.HTML.OutputMethodXML", true);

			
 
				+

			
 
				+			//Don't care what type of exporter you use

			
 
				+			Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_NONE);

			
 
				+			//Prefer the exporter, that uses a xsl transformation

			
 
				+			//Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_EXPORT_PREFER_XSL);

			
 
				+			//Prefer the exporter, that doesn't use a xsl transformation (= uses a visitor)

			
 
				+//			Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_EXPORT_PREFER_NONXSL);

			
 
				+			//String originalHtml = new String(os.toString(), "UTF-8");

			
 
				+			if (save) {

			
 
				+				System.out.println("Saved: " + inputfilepath + ".html ");

			
 
				+			} else {

			
 
				+				//System.out.println( ((ByteArrayOutputStream)os).toString() );

			
 
				+				String originalHtml = ((ByteArrayOutputStream)os).toString();

			
 
				+				//System.out.println(CommonUtils.extractText(html));

			
 
				+				originalHtml = originalHtml.substring(originalHtml.indexOf("<body>") + "<body>".length(),originalHtml.indexOf("</body"));

			
 
				+				originalHtml = originalHtml.substring(originalHtml.indexOf("<p"),originalHtml.lastIndexOf("</span>"))+"</span></p>";

			
 
				+				String[] trr = originalHtml.split("\\[.*试题分类.*\\]:");

			
 
				+			}

			
 
				+

			
 
				+			// Clean up, so any ObfuscatedFontPart temp files can be deleted 

			
 
				+			if (wordMLPackage.getMainDocumentPart().getFontTablePart()!=null) {

			
 
				+				wordMLPackage.getMainDocumentPart().getFontTablePart().deleteEmbeddedFontTempFiles();

			
 
				+			}		

			
 
				+			// This would also do it, via finalize() methods

			
 
				+			htmlSettings = null;

			
 
				+			wordMLPackage = null;

			
 
				+	    }

			
 
				+}

			
--- a/cqb-comm-utils/src/main/java/com/qmth/cqb/utils/ParserDocxUtils.java
+++ b/cqb-comm-utils/src/main/java/com/qmth/cqb/utils/ParserDocxUtils.java
@@ -0,0 +1,73 @@
 
				+package com.qmth.cqb.utils;

			
 
				+

			
 
				+import java.io.File;

			
 
				+import java.net.URL;

			
 
				+import java.util.ArrayList;

			
 
				+import java.util.List;

			
 
				+

			
 
				+import javax.xml.bind.JAXBElement;

			
 
				+

			
 
				+import org.apache.commons.lang3.StringUtils;

			
 
				+import org.docx4j.Docx4J;

			
 
				+import org.docx4j.TraversalUtil;

			
 
				+import org.docx4j.XmlUtils;

			
 
				+import org.docx4j.convert.out.html.AbstractHtmlExporter.HtmlSettings;

			
 
				+import org.docx4j.finders.ClassFinder;

			
 
				+import org.docx4j.openpackaging.exceptions.Docx4JException;

			
 
				+import org.docx4j.openpackaging.packages.WordprocessingMLPackage;

			
 
				+import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;

			
 
				+import org.docx4j.wml.ArrayListWml;

			
 
				+import org.docx4j.wml.Document;

			
 
				+import org.docx4j.wml.FldChar;

			
 
				+import org.docx4j.wml.P;

			
 
				+import org.docx4j.wml.R;

			
 
				+import org.docx4j.wml.RPr;

			
 
				+import org.docx4j.wml.Text;

			
 
				+

			
 
				+/**
				+ * Scratch utility that loads a docx file with docx4j, dumps the XML of each
				+ * paragraph and collects the paragraphs that carry text.
				+ */
				+public class ParserDocxUtils {
				+
				+	/**
				+	 * Parses {@code /test/test.docx} below the current working directory.
				+	 *
				+	 * @param args unused
				+	 * @throws Docx4JException if the document cannot be loaded
				+	 */
				+	public static void main(String[] args) throws Docx4JException{
				+		parserDocx("/test/test.docx");
				+	}
				+
				+	/**
				+	 * Loads the document at {@code user.dir + docxFilePath}, prints the XML of
				+	 * every paragraph to stdout and gathers the XML of the non-empty ones.
				+	 *
				+	 * @param docxFilePath path appended to the {@code user.dir} system property
				+	 * @return marshalled XML of each paragraph that holds at least one
				+	 *         non-empty text element, in document traversal order
				+	 * @throws Docx4JException if the document cannot be loaded
				+	 */
				+	private static List<String> parserDocx(String docxFilePath) throws Docx4JException {
				+		WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(new File(System.getProperty("user.dir")+docxFilePath));
				+		MainDocumentPart mdp = wordMLPackage.getMainDocumentPart();
				+
				+		ClassFinder paragraphFinder = new ClassFinder(P.class); // <----- change this to suit
				+		new TraversalUtil(mdp.getContent(), paragraphFinder);
				+
				+		List<String> pStrs = new ArrayList<String>();
				+		for (Object candidate : paragraphFinder.results) {
				+			if (!(candidate instanceof P)) {
				+				continue;
				+			}
				+			P paragraph = (P) candidate;
				+			// Marshal once; the same string is printed and, if kept, stored.
				+			String xml = XmlUtils.marshaltoString(paragraph);
				+			System.out.println(xml);
				+			if (hasText(paragraph)) {
				+				pStrs.add(xml);
				+			}
				+		}
				+		return pStrs;
				+	}
				+
				+	/**
				+	 * Tells whether a paragraph holds at least one text element with a
				+	 * non-empty value; used to skip empty paragraphs.
				+	 *
				+	 * @param paragraph paragraph to inspect
				+	 * @return true when a non-empty text element is found
				+	 */
				+	private static boolean hasText(P paragraph) {
				+		ClassFinder textFinder = new ClassFinder(Text.class);
				+		new TraversalUtil(paragraph.getContent(), textFinder);
				+		for (Object node : textFinder.results) {
				+			if (node instanceof Text && StringUtils.isNotEmpty(((Text) node).getValue())) {
				+				return true;
				+			}
				+		}
				+		return false;
				+	}
				+}