|
@@ -12,7 +12,6 @@ import net.sourceforge.jeuclid.context.LayoutContextImpl;
|
|
import net.sourceforge.jeuclid.context.StyleAttributeLayoutContext;
|
|
import net.sourceforge.jeuclid.context.StyleAttributeLayoutContext;
|
|
import net.sourceforge.jeuclid.converter.Converter;
|
|
import net.sourceforge.jeuclid.converter.Converter;
|
|
import org.apache.commons.codec.binary.Base64;
|
|
import org.apache.commons.codec.binary.Base64;
|
|
-import org.apache.commons.collections4.CollectionUtils;
|
|
|
|
import org.apache.commons.io.FileUtils;
|
|
import org.apache.commons.io.FileUtils;
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.apache.commons.lang3.StringUtils;
|
|
import org.apache.commons.lang3.StringUtils;
|
|
@@ -45,9 +44,6 @@ import org.dom4j.DocumentHelper;
|
|
import org.dom4j.Namespace;
|
|
import org.dom4j.Namespace;
|
|
import org.dom4j.io.SAXReader;
|
|
import org.dom4j.io.SAXReader;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.Jsoup;
|
|
-import org.jsoup.nodes.Element;
|
|
|
|
-import org.jsoup.nodes.Node;
|
|
|
|
-import org.jsoup.nodes.TextNode;
|
|
|
|
import org.jsoup.select.Elements;
|
|
import org.jsoup.select.Elements;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
import org.slf4j.LoggerFactory;
|
|
@@ -501,38 +497,10 @@ public final class DocxProcessUtil {
|
|
}
|
|
}
|
|
htmlStr = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
|
|
htmlStr = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
|
|
|
|
|
|
- try {
|
|
|
|
- org.jsoup.nodes.Document doc = Jsoup.parse(htmlStr);
|
|
|
|
- StringBuilder textStr = new StringBuilder();
|
|
|
|
- Elements links = doc.select("body");
|
|
|
|
- if(links==null||links.size()==0) {
|
|
|
|
- return htmlStr;
|
|
|
|
- }
|
|
|
|
- for (Node node : links.get(0).childNodes()) {
|
|
|
|
- getTextByNode(textStr, node);
|
|
|
|
- }
|
|
|
|
- return textStr.toString();
|
|
|
|
- } catch (Exception e) {
|
|
|
|
- throw new RuntimeException(e);
|
|
|
|
- }
|
|
|
|
|
|
+ org.jsoup.nodes.Document doc = Jsoup.parse(htmlStr);
|
|
|
|
+ return doc.text();
|
|
}
|
|
}
|
|
|
|
|
|
- private static void getTextByNode(StringBuilder textStr, Node node) {
|
|
|
|
-
|
|
|
|
- if (node instanceof TextNode) {
|
|
|
|
- TextNode tn = (TextNode) node;
|
|
|
|
- textStr.append(tn.text());
|
|
|
|
- } else if (node instanceof Element) {
|
|
|
|
- Element e = (Element) node;
|
|
|
|
- if (CollectionUtils.isNotEmpty(e.childNodes())) {
|
|
|
|
- for (Node snode : e.childNodes()) {
|
|
|
|
- getTextByNode(textStr, snode);
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }else {
|
|
|
|
-// throw new StatusException("解析出错:"+node);
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
* 格式化转换后的html(html临时文件)
|
|
* 格式化转换后的html(html临时文件)
|