|
@@ -12,6 +12,7 @@ import net.sourceforge.jeuclid.context.LayoutContextImpl;
|
|
|
import net.sourceforge.jeuclid.context.StyleAttributeLayoutContext;
|
|
|
import net.sourceforge.jeuclid.converter.Converter;
|
|
|
import org.apache.commons.codec.binary.Base64;
|
|
|
+import org.apache.commons.collections4.CollectionUtils;
|
|
|
import org.apache.commons.io.FileUtils;
|
|
|
import org.apache.commons.io.IOUtils;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
@@ -45,6 +46,8 @@ import org.dom4j.Namespace;
|
|
|
import org.dom4j.io.SAXReader;
|
|
|
import org.jsoup.Jsoup;
|
|
|
import org.jsoup.nodes.Element;
|
|
|
+import org.jsoup.nodes.Node;
|
|
|
+import org.jsoup.nodes.TextNode;
|
|
|
import org.jsoup.select.Elements;
|
|
|
import org.slf4j.Logger;
|
|
|
import org.slf4j.LoggerFactory;
|
|
@@ -469,28 +472,61 @@ public final class DocxProcessUtil {
|
|
|
* @param htmlStr
|
|
|
* @return
|
|
|
*/
|
|
|
+// public static String getTextInHtml(String htmlStr) {
|
|
|
+// htmlStr = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
|
|
|
+// if (!htmlStr.startsWith("<p>")) {
|
|
|
+// return htmlStr;
|
|
|
+// }
|
|
|
+//
|
|
|
+// try {
|
|
|
+// org.jsoup.nodes.Document doc = Jsoup.parse(htmlStr);
|
|
|
+// StringBuilder textStr = new StringBuilder();
|
|
|
+// Elements links = doc.select("p").removeAttr("img");
|
|
|
+//
|
|
|
+// for (Element link : links) {
|
|
|
+// textStr.append(link.text().trim());
|
|
|
+// }
|
|
|
+//
|
|
|
+// return textStr.toString();
|
|
|
+// } catch (Exception e) {
|
|
|
+// LOG.error(e.getMessage(), e);
|
|
|
+// }
|
|
|
+//
|
|
|
+// return htmlStr;
|
|
|
+// }
|
|
|
+
|
|
|
+    /**
+     * Extracts the text content of an HTML string.
+     * <p>
+     * Named character references (an ampersand, 1-10 ASCII letters and a semicolon, e.g.
+     * {@code &nbsp;}) are removed before parsing, so they never reach the output. The remaining
+     * markup is parsed with jsoup and the text nodes found under the document body are concatenated
+     * in document order; no separator is inserted between elements.
+     * <p>
+     * Unchecked exceptions raised while parsing or walking the tree propagate to the caller
+     * unchanged instead of being re-wrapped.
+     *
+     * @param htmlStr HTML string to extract text from; {@code null} is treated as empty
+     * @return the concatenated text, or an empty string when there is nothing to extract
+     */
     public static String getTextInHtml(String htmlStr) {
-        htmlStr = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
-        if (!htmlStr.startsWith("<p>")) {
-            return htmlStr;
-        }
-
-        try {
-            org.jsoup.nodes.Document doc = Jsoup.parse(htmlStr);
-            StringBuilder textStr = new StringBuilder();
-            Elements links = doc.select("p").removeAttr("img");
-
-            for (Element link : links) {
-                textStr.append(link.text().trim());
-            }
-
-            return textStr.toString();
-        } catch (Exception e) {
-            LOG.error(e.getMessage(), e);
-        }
-
-        return htmlStr;
-    }
+        if (htmlStr == null) {
+            return "";
+        }
+
+        // Strip named entities up front so they are dropped rather than decoded by the parser.
+        String stripped = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
+
+        Element body = Jsoup.parse(stripped).body();
+        if (body == null) {
+            // No body element available (NOTE(review): only expected for frameset-style
+            // documents on older jsoup versions - confirm); there is no text to collect.
+            return "";
+        }
+
+        StringBuilder textStr = new StringBuilder();
+        for (Node node : body.childNodes()) {
+            getTextByNode(textStr, node);
+        }
+        return textStr.toString();
+    }
|
|
|
+
|
|
|
+    /**
+     * Appends the text found under {@code node} to {@code textStr}, walking the subtree
+     * depth-first in document order.
+     * <p>
+     * A {@link TextNode} contributes the result of its {@code text()} call. An {@link Element}
+     * contributes nothing itself; each of its child nodes is visited recursively. Any other node
+     * type is skipped.
+     *
+     * @param textStr buffer the extracted text is appended to
+     * @param node    node whose text should be collected
+     */
+    private static void getTextByNode(StringBuilder textStr, Node node) {
+        if (node instanceof TextNode) {
+            textStr.append(((TextNode) node).text());
+        } else if (node instanceof Element) {
+            // Looping over an empty child list is a no-op, so no emptiness check is needed.
+            for (Node child : node.childNodes()) {
+                getTextByNode(textStr, child);
+            }
+        }
+        // Nodes that are neither text nor elements are ignored; an earlier draft threw a
+        // "parse error" StatusException here, but that throw was disabled.
+    }
|
|
|
|
|
|
/**
|
|
|
* 格式化转换后的html(html临时文件)
|