فهرست منبع

相似度分析增加html特殊字符过滤

宋悦 7 سال پیش
والد
کامیت
403a9fdf8e
1 فایل تغییر یافته به همراه 3 افزوده شده و 3 حذف شده
  1. 3 3
      cqb-comm-utils/src/main/java/com/qmth/cqb/utils/StringSimilarityUtils.java

+ 3 - 3
cqb-comm-utils/src/main/java/com/qmth/cqb/utils/StringSimilarityUtils.java

@@ -272,13 +272,12 @@ public class StringSimilarityUtils {
      * @return
      */
     public static String stringFilter(String str) {
-        String regEx = "[_`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]";
+        String regEx = "\\&[a-zA-Z]{1,10};|[_`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]";
         return str.replaceAll(regEx, "").trim();
-
     }
 
     public static void main(String[] args) {
-        String str1 = "秦汉以i am filter he is hehe abc来的公文程式构成有\n<><>_________________ !!!!!";
+        String str1 = "秦汉以i am filter he is hehe abc来的公文程式构成有\n<><>_________________ !!!!!&nbsp;&copy;&lt;&gt;&nbsp;&nbsp;";
         String str2 = "More roads than one lead to the mountain village.";
         // System.out.println(StringSimilarityUtils.stringFilter(str1));
         // System.out.println(StringSimilarityUtils.stringFilter(str2));
@@ -293,6 +292,7 @@ public class StringSimilarityUtils {
         // System.out.println(similarity_cos);
         // System.out.println(similarity_dice);
         // System.out.println(similarity_diceopt);
+        System.out.println(stringFilter(str1));
         System.out.println(segmentText(stringFilter(str1)));
     }
 }