Kaynağa Gözat

相似度分析增加html特殊字符过滤

宋悦 7 yıl önce
ebeveyn
işleme
7b1a3e5504

+ 3 - 1
cqb-comm-utils/src/main/java/com/qmth/cqb/utils/word/DocxProcessUtil.java

@@ -450,6 +450,7 @@ public final class DocxProcessUtil {
      * @return
      */
     public static String getTextInHtml(String htmlStr){
+        htmlStr = htmlStr.replaceAll("\\&[a-zA-Z]{1,10};", "").trim();
         if(!htmlStr.startsWith("<p>")){
             return htmlStr;
         }
@@ -457,8 +458,9 @@ public final class DocxProcessUtil {
             org.jsoup.nodes.Document doc = Jsoup.parse(htmlStr);
             String textStr = "";
             Elements links = doc.select("p").removeAttr("img");
+
             for (Element link : links) {
-                textStr += link.text();
+                textStr += link.text().trim();
             }
             return textStr;
         } catch (Exception e) {