|
@@ -20,11 +20,11 @@ from tqdm import tqdm
|
|
|
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
|
|
|
# проверяемая директория
|
|
|
# LECTION_DIR = os.path.join("ISRPO", "Лекции")
|
|
|
-LECTION_DIR = os.path.join("EASvZI", "Лекции")
|
|
|
-# LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
|
|
|
+# LECTION_DIR = os.path.join("EASvZI", "Лекции")
|
|
|
+LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
|
|
|
|
|
|
# ссылка для проверки
|
|
|
-url = "http://213.155.192.79:3001/u20lebed/ISRPO/raw/88f913f3d1af9ee9bed0468b183ec9d4f6e5f2b4/2022-23/%d0%ad%d0%ba%d0%b7%d0%b0%d0%bc%d0%b5%d0%bd/Lebedkin.md"
|
|
|
+url = "http://213.155.192.79:3001/u19-23cukanov/TZI/raw/b722d7d9874f8245f669ee85687b441489281033/2022-23/%d0%94%d0%b8%d1%84.%d0%b7%d0%b0%d1%87%d0%b5%d1%82_2%d1%81%d0%b5%d0%bc/%d0%a6%d1%83%d0%ba%d0%b0%d0%bd%d0%be%d0%b2_17_%d0%b1%d0%b8%d0%bb%d0%b5%d1%82.md"
|
|
|
|
|
|
# ------------------------------- / НАСТРОЙКИ ------------
|
|
|
|
|
@@ -103,35 +103,37 @@ readme_list = readme_html.split("\n")
|
|
|
for readme_str in tqdm(readme_list):
|
|
|
if '[' in readme_str:
|
|
|
# print(f"проверяю {readme_str}")
|
|
|
- variant_name, t = readme_str.split("]")
|
|
|
- variant_name = variant_name.strip("[")
|
|
|
- t, variant_uri = readme_str.split("(")
|
|
|
- variant_uri = variant_uri.replace("),", "")
|
|
|
- variant_uri = variant_uri.replace(")", "")
|
|
|
- variant_uri = variant_uri.strip()
|
|
|
-
|
|
|
- variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
|
|
|
try:
|
|
|
- with open(variant_path, encoding="utf-8") as f:
|
|
|
- variant_html = f.read()
|
|
|
+ variant_name, t = readme_str.split("]")
|
|
|
+ variant_name = variant_name.strip("[")
|
|
|
+ t, variant_uri = readme_str.split("(")
|
|
|
+ variant_uri = variant_uri.replace("),", "")
|
|
|
+ variant_uri = variant_uri.replace(")", "")
|
|
|
+ variant_uri = variant_uri.strip()
|
|
|
+
|
|
|
+ variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
|
|
|
+ try:
|
|
|
+ with open(variant_path, encoding="utf-8") as f:
|
|
|
+ variant_html = f.read()
|
|
|
+ except:
|
|
|
+ with open(variant_path, encoding="cp1251") as f:
|
|
|
+ variant_html = f.read()
|
|
|
+
|
|
|
+ variant_tokens, variant_uniq_text = preprocess_text(variant_html)
|
|
|
+ # print(f"количество уникальных слов варианта: {len(set(variant_tokens))}")
|
|
|
+
|
|
|
+ # пересечение множеств
|
|
|
+ min_tokens_len = min([len(set(post_tokens)), len(set(variant_tokens))])
|
|
|
+ c = list(set(post_tokens) & set(variant_tokens))
|
|
|
+ ratio = (1 - (len(c) / min_tokens_len)) * 100
|
|
|
+ if ratio < min_ratio:
|
|
|
+ min_ratio = ratio
|
|
|
+ min_ratio_file = variant_path
|
|
|
+
|
|
|
+ # print(f"количество совпадающих слов: {len(c)} / {ratio:.2f}%")
|
|
|
+ # print()
|
|
|
except:
|
|
|
- with open(variant_path, encoding="cp1251") as f:
|
|
|
- variant_html = f.read()
|
|
|
-
|
|
|
- variant_tokens, variant_uniq_text = preprocess_text(variant_html)
|
|
|
- # print(f"количество уникальных слов варианта: {len(set(variant_tokens))}")
|
|
|
-
|
|
|
- # пересечение множеств
|
|
|
- min_tokens_len = min([len(set(post_tokens)), len(set(variant_tokens))])
|
|
|
- c = list(set(post_tokens) & set(variant_tokens))
|
|
|
- ratio = (1 - (len(c) / min_tokens_len)) * 100
|
|
|
- if ratio < min_ratio:
|
|
|
- min_ratio = ratio
|
|
|
- min_ratio_file = variant_path
|
|
|
-
|
|
|
- # print(f"количество совпадающих слов: {len(c)} / {ratio:.2f}%")
|
|
|
- # print()
|
|
|
-
|
|
|
+ print(f"Ошибка распаковки {readme_str}")
|
|
|
|
|
|
print()
|
|
|
print(f"min_ratio: {min_ratio}%")
|