xiatian 1 jaar geleden
bovenliggende
commit
8aeca3e30f

+ 57 - 21
examcloud-core-questions-base/src/main/java/cn/com/qmth/examcloud/core/questions/base/word/DocxProcessUtil.java

@@ -12,6 +12,7 @@ import net.sourceforge.jeuclid.context.LayoutContextImpl;
 import net.sourceforge.jeuclid.context.StyleAttributeLayoutContext;
 import net.sourceforge.jeuclid.converter.Converter;
 import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
@@ -45,6 +46,8 @@ import org.dom4j.Namespace;
 import org.dom4j.io.SAXReader;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
 import org.jsoup.select.Elements;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -469,28 +472,61 @@ public final class DocxProcessUtil {
      * @param htmlStr
      * @return
      */
+//    public static String getTextInHtml(String htmlStr) {
+//        htmlStr = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
+//        if (!htmlStr.startsWith("<p>")) {
+//            return htmlStr;
+//        }
+//
+//        try {
+//            org.jsoup.nodes.Document doc = Jsoup.parse(htmlStr);
+//            StringBuilder textStr = new StringBuilder();
+//            Elements links = doc.select("p").removeAttr("img");
+//
+//            for (Element link : links) {
+//                textStr.append(link.text().trim());
+//            }
+//
+//            return textStr.toString();
+//        } catch (Exception e) {
+//            LOG.error(e.getMessage(), e);
+//        }
+//
+//        return htmlStr;
+//    }
+    
     public static String getTextInHtml(String htmlStr) {
-        htmlStr = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
-        if (!htmlStr.startsWith("<p>")) {
-            return htmlStr;
-        }
-
-        try {
-            org.jsoup.nodes.Document doc = Jsoup.parse(htmlStr);
-            StringBuilder textStr = new StringBuilder();
-            Elements links = doc.select("p").removeAttr("img");
-
-            for (Element link : links) {
-                textStr.append(link.text().trim());
-            }
-
-            return textStr.toString();
-        } catch (Exception e) {
-            LOG.error(e.getMessage(), e);
-        }
-
-        return htmlStr;
-    }
+        // Named entities (an ampersand, 1-10 letters, a semicolon, e.g. &nbsp;) are dropped
+        // before parsing; the text found under <body> is then gathered by getTextByNode.
+        htmlStr = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
+        try {
+            Element body = Jsoup.parse(htmlStr).body();
+            StringBuilder textStr = new StringBuilder();
+            for (Node node : body.childNodes()) {
+                getTextByNode(textStr, node);
+            }
+            return textStr.toString();
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Appends the text of {@code node} and of all its descendants to {@code textStr}.
+     * Nodes that are neither a TextNode nor an Element contribute nothing.
+     *
+     * @param textStr accumulator that receives the text, in depth-first child order
+     * @param node    node to read: a TextNode adds its text(), an Element adds its children
+     */
+    private static void getTextByNode(StringBuilder textStr, Node node) {
+        if (node instanceof TextNode) {
+            textStr.append(((TextNode) node).text());
+        } else if (node instanceof Element) {
+            // childNodes() is never null, so an element without children just skips the loop
+            for (Node child : node.childNodes()) {
+                getTextByNode(textStr, child);
+            }
+        }
+    }
 
     /**
      * 格式化转换后的html(html临时文件)