Ver código fonte

导入测试

gaoxing 8 anos atrás
pai
commit
533360b5e9

+ 147 - 0
cqb-comm-utils/src/main/java/com/qmth/cqb/utils/ConvertOutHtml.java

@@ -0,0 +1,147 @@
+package com.qmth.cqb.utils;
+
+import java.io.ByteArrayOutputStream;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import org.docx4j.Docx4J;
+import org.docx4j.Docx4jProperties;
+import org.docx4j.convert.out.ConversionFeatures;
+import org.docx4j.convert.out.HTMLSettings;
+import org.docx4j.convert.out.html.SdtToListSdtTagHandler;
+import org.docx4j.convert.out.html.SdtWriter;
+import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
+import org.docx4j.samples.AbstractSample;
+
+
+public class ConvertOutHtml extends AbstractSample  {
+	
+	
+
+		// Config for non-command line version
+		static {
+		
+	    	inputfilepath = System.getProperty("user.dir") + "/test/test.docx";
+
+			save = false;
+			nestLists = false;
+		}
+
+		static boolean save;
+		static boolean nestLists;
+
+	    public static void main(String[] args)
+	            throws Exception {
+	    	
+			try {
+				getInputFilePath(args);
+			} catch (IllegalArgumentException e) {
+			}
+			
+			// Document loading (required)
+			WordprocessingMLPackage wordMLPackage;
+			if (inputfilepath==null) {
+				// Create a docx
+				System.out.println("No imput path passed, creating dummy document");
+				 wordMLPackage = WordprocessingMLPackage.createPackage();
+				//SampleDocument.createContent(wordMLPackage.getMainDocumentPart());	
+			} else {
+				System.out.println("Loading file from " + inputfilepath);
+				wordMLPackage = Docx4J.load(new java.io.File(inputfilepath));
+			}
+
+			// HTML exporter setup (required)
+			// .. the HTMLSettings object
+	    	HTMLSettings htmlSettings = Docx4J.createHTMLSettings();
+
+	    	htmlSettings.setImageDirPath(inputfilepath + "_files");
+	    	htmlSettings.setImageTargetUri(inputfilepath.substring(inputfilepath.lastIndexOf("/")+1)
+	    			+ "_files");
+	    	htmlSettings.setWmlPackage(wordMLPackage);
+	    	
+	    	
+	    	/* CSS reset, see http://itumbcom.blogspot.com.au/2013/06/css-reset-how-complex-it-should-be.html 
+	    	 * 
+	    	 * motivated by vertical space in tables in Firefox and Google Chrome.
+	        
+		        If you have unwanted vertical space, in Chrome this may be coming from -webkit-margin-before and -webkit-margin-after
+		        (in Firefox, margin-top is set to 1em in html.css)
+		        
+		        Setting margin: 0 on p is enough to fix it.
+		        
+		        See further http://www.css-101.org/articles/base-styles-sheet-for-webkit-based-browsers/    	
+	    	*/
+	    	String userCSS = null;
+	    	if (nestLists) {
+	    		// use browser defaults for ol, ul, li
+	    		userCSS = "html, body, div, span, h1, h2, h3, h4, h5, h6, p, a, img,  table, caption, tbody, tfoot, thead, tr, th, td " +
+	    			"{ margin: 0; padding: 0; border: 0;}" +
+	    			"body {line-height: 1;} ";
+	    	} else {
+	    		userCSS = "html, body, div, span, h1, h2, h3, h4, h5, h6, p, a, img,  ol, ul, li, table, caption, tbody, tfoot, thead, tr, th, td " +
+	        			"{ margin: 0; padding: 0; border: 0;}" +
+	        			"body {line-height: 1;} ";
+	    		
+	    	}
+	    	htmlSettings.setUserCSS(userCSS);
+	    	
+	    	
+	    	//Other settings (optional)
+//	    	htmlSettings.setUserBodyTop("<H1>TOP!</H1>");
+//	    	htmlSettings.setUserBodyTail("<H1>TAIL!</H1>");
+			
+			// Sample sdt tag handler (tag handlers insert specific
+			// html depending on the contents of an sdt's tag).
+			// This will only have an effect if the sdt tag contains
+			// the string @class=XXX
+//				SdtWriter.registerTagHandler("@class", new TagClass() );
+			
+//			SdtWriter.registerTagHandler(Containerization.TAG_BORDERS, new TagSingleBox() );
+//			SdtWriter.registerTagHandler(Containerization.TAG_SHADING, new TagSingleBox() );
+	    	
+	    	
+	    	// list numbering:  depending on whether you want list numbering hardcoded, or done using <li>.
+	    	if (nestLists) {
+	    		SdtWriter.registerTagHandler("HTML_ELEMENT", new SdtToListSdtTagHandler());
+	    	} else {
+	    		htmlSettings.getFeatures().remove(ConversionFeatures.PP_HTML_COLLECT_LISTS);
+	    	}
+			
+			// output to an OutputStream.		
+			OutputStream os; 
+			if (save) {
+				os = new FileOutputStream(inputfilepath + ".html");
+			} else {
+				os = new ByteArrayOutputStream();
+				
+			}
+
+			// If you want XHTML output
+	    	Docx4jProperties.setProperty("docx4j.Convert.Out.HTML.OutputMethodXML", true);
+
+			//Don't care what type of exporter you use
+			Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_NONE);
+			//Prefer the exporter, that uses a xsl transformation
+			//Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_EXPORT_PREFER_XSL);
+			//Prefer the exporter, that doesn't use a xsl transformation (= uses a visitor)
+//			Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_EXPORT_PREFER_NONXSL);
+			//String originalHtml = new String(os.toString(), "UTF-8");
+			if (save) {
+				System.out.println("Saved: " + inputfilepath + ".html ");
+			} else {
+				//System.out.println( ((ByteArrayOutputStream)os).toString() );
+				String originalHtml = ((ByteArrayOutputStream)os).toString();
+				//System.out.println(CommonUtils.extractText(html));
+				originalHtml = originalHtml.substring(originalHtml.indexOf("<body>") + "<body>".length(),originalHtml.indexOf("</body"));
+				originalHtml = originalHtml.substring(originalHtml.indexOf("<p"),originalHtml.lastIndexOf("</span>"))+"</span></p>";
+				String[] trr = originalHtml.split("\\[.*试题分类.*\\]:");
+			}
+
+			// Clean up, so any ObfuscatedFontPart temp files can be deleted 
+			if (wordMLPackage.getMainDocumentPart().getFontTablePart()!=null) {
+				wordMLPackage.getMainDocumentPart().getFontTablePart().deleteEmbeddedFontTempFiles();
+			}		
+			// This would also do it, via finalize() methods
+			htmlSettings = null;
+			wordMLPackage = null;
+	    }
+}

+ 73 - 0
cqb-comm-utils/src/main/java/com/qmth/cqb/utils/ParserDocxUtils.java

@@ -0,0 +1,73 @@
+package com.qmth.cqb.utils;
+
+import java.io.File;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.xml.bind.JAXBElement;
+
+import org.apache.commons.lang3.StringUtils;
+import org.docx4j.Docx4J;
+import org.docx4j.TraversalUtil;
+import org.docx4j.XmlUtils;
+import org.docx4j.convert.out.html.AbstractHtmlExporter.HtmlSettings;
+import org.docx4j.finders.ClassFinder;
+import org.docx4j.openpackaging.exceptions.Docx4JException;
+import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
+import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
+import org.docx4j.wml.ArrayListWml;
+import org.docx4j.wml.Document;
+import org.docx4j.wml.FldChar;
+import org.docx4j.wml.P;
+import org.docx4j.wml.R;
+import org.docx4j.wml.RPr;
+import org.docx4j.wml.Text;
+
+/**
+ * Scratch utility that walks the paragraphs of a docx file, prints each one
+ * as XML and collects those that contain non-empty text.
+ */
+public class ParserDocxUtils {
+
+	/**
+	 * Parses {@code /test/test.docx} below the current working directory.
+	 *
+	 * @param args unused
+	 * @throws Docx4JException if the document cannot be loaded
+	 */
+	public static void main(String[] args) throws Docx4JException{
+		parserDocx("/test/test.docx");
+	}
+
+	/**
+	 * Loads the document and collects every paragraph holding at least one
+	 * non-empty text node, both as object and as marshalled XML. Each
+	 * paragraph found is printed to standard output as XML.
+	 *
+	 * @param docxFilePath path appended to the {@code user.dir} system property
+	 * @throws Docx4JException if the document cannot be loaded
+	 */
+	private static void parserDocx(String docxFilePath) throws Docx4JException {
+		File docxFile = new File(System.getProperty("user.dir")+docxFilePath);
+		WordprocessingMLPackage docxPackage = WordprocessingMLPackage.load(docxFile);
+		MainDocumentPart mainPart = docxPackage.getMainDocumentPart();
+
+		// Change the class passed here to collect a different element type.
+		ClassFinder paragraphFinder = new ClassFinder(P.class);
+		new TraversalUtil(mainPart.getContent(), paragraphFinder);
+
+		List<P> nonEmptyParagraphs = new ArrayList<P>();
+		List<String> nonEmptyParagraphXml = new ArrayList<String>();
+		for (Object candidate : paragraphFinder.results) {
+			if (!(candidate instanceof P)) {
+				continue;
+			}
+			P paragraph = (P) candidate;
+			System.out.println(XmlUtils.marshaltoString(paragraph));
+
+			// Skip empty paragraphs.
+			if (hasNonEmptyText(paragraph)) {
+				nonEmptyParagraphs.add(paragraph);
+				nonEmptyParagraphXml.add(XmlUtils.marshaltoString(paragraph));
+			}
+		}
+		// The collected paragraphs are not consumed yet.
+	}
+
+	/** Reports whether the paragraph contains at least one text node with a non-empty value. */
+	private static boolean hasNonEmptyText(P paragraph) {
+		ClassFinder textFinder = new ClassFinder(Text.class);
+		new TraversalUtil(paragraph.getContent(), textFinder);
+		for (Object node : textFinder.results) {
+			if (node instanceof Text && StringUtils.isNotEmpty(((Text) node).getValue())) {
+				return true;
+			}
+		}
+		return false;
+	}
+}