Spaces:

inflaton
/

rap

Sleeping

App Files Files Community

dh-mc committed on Oct 22, 2024

Commit

b6836c3

1 Parent(s): 522e057

clean up code

Browse files

Files changed (3) hide show

app.py +1 -2
eval_modules/calc_repetitions_v2e.py +0 -1333
eval_modules/utils.py +67 -156

app.py CHANGED Viewed

@@ -17,8 +17,7 @@ path = os.path.dirname(found_dotenv)
 print(f"Adding {path} to sys.path")
 sys.path.append(path)
-from eval_modules.utils import calc_perf_scores
-from eval_modules.calc_repetitions_v2e import detect_repetitions
 model_name = os.getenv("MODEL_NAME") or "microsoft/Phi-3.5-mini-instruct"
 hf_token = os.getenv("HF_TOKEN")

 print(f"Adding {path} to sys.path")
 sys.path.append(path)
+from eval_modules.utils import calc_perf_scores, detect_repetitions
 model_name = os.getenv("MODEL_NAME") or "microsoft/Phi-3.5-mini-instruct"
 hf_token = os.getenv("HF_TOKEN")

eval_modules/calc_repetitions_v2e.py DELETED Viewed

@@ -1,1333 +0,0 @@
-import os
-import re
-import math
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-import matplotlib.ticker as mtick
-import seaborn as sns
-import nltk
-import evaluate
-import traceback
-bert_score = evaluate.load("bertscore")
-meteor = evaluate.load("meteor")
-print(f"loading: {__file__}")
-# pattern_non_word_char_repetition = re.compile(r"\s{5,}")
-# pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
-# final version
-pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}")
-pattern_text_repetitions = re.compile(
-    r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+", re.M | re.DOTALL | re.IGNORECASE
-)
-# Explanation of the Regex Pattern:
-#   (?P<repeat>.{5}.*?): Captures any sequence of characters with minimal length of 5 and names this group repeat.
-#     .*?: Matches zero or more characters, non-greedily (as few as possible).
-#   (?:[\s\W]*(?P=repeat))+: A non-capturing group that matches one or more repetitions of:
-#     [\s\W]*: Zero or more whitespace or non-word characters (spaces, punctuation, etc.).
-#     (?P=repeat): A backreference to the named group repeat.
-def del_non_word_char_repetition(text, debug=False):
-    count = 0
-    if isinstance(text, str):
-        if debug:
-            print("----detect non-word characters repetition----")
-        count = len(text)
-        text = pattern_non_word_char_repetition.sub("\t", text)
-        count -= len(text)
-        if debug and count:
-            print(f"removed non-word characters repetition: {count}")
-    return text, count
-# final version for repetition detection
-def detect_text_repetitions(text, debug=False):
-    count = 0
-    if isinstance(text, str):
-        if debug:
-            print("----detect text repetitions----")
-        matches = pattern_text_repetitions.finditer(text)
-        for match in matches:
-            if debug:
-                print(match)
-                for groupNum in range(0, len(match.groups())):
-                    groupNum = groupNum + 1
-                    print(
-                        "Group {groupNum} found at {start}-{end}: `{group}`".format(
-                            groupNum=groupNum,
-                            start=match.start(groupNum),
-                            end=match.end(groupNum),
-                            group=match.group(groupNum),
-                        )
-                    )
-            start, end = match.span()
-            count += end - start - len(match.group(1))
-    return count
-def detect_repetitions(text, debug=False):
-    if isinstance(text, str) is False:
-        return 0, 0, 0
-    text, count_non_word_char_repetition = del_non_word_char_repetition(
-        text, debug=debug
-    )
-    count_text_repetitions = detect_text_repetitions(text, debug=debug)
-    total_repetitions = count_non_word_char_repetition + count_text_repetitions
-    result = (count_non_word_char_repetition, count_text_repetitions, total_repetitions)
-    if debug:
-        print(result)
-    return result
-def detect_scores(
-    row, debug=False, answer_col="answer", ground_truth_col="ground_truth"
-):
-    newline_score, repetition_score, total_repetitions = detect_repetitions(
-        row[answer_col], debug=debug
-    )
-    if ground_truth_col:
-        ground_truth_newline_score, ground_truth_repetition_score, _ = (
-            detect_repetitions(row[ground_truth_col], debug=debug)
-        )
-        newline_score -= ground_truth_newline_score
-        if newline_score < 0:
-            newline_score = 0
-        repetition_score -= ground_truth_repetition_score
-        if repetition_score < 0:
-            repetition_score = 0
-        total_repetitions = newline_score + repetition_score
-    return pd.Series([newline_score, repetition_score, total_repetitions])
-def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
-    print(f"loading result file: {result_file}")
-    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
-    if (
-        force_recalculate
-        or "newline_score" not in df.columns
-        or "repetition_score" not in df.columns
-        or "total_repetitions" not in df.columns
-        or "nrr" not in df.columns
-        or "rr" not in df.columns
-    ):
-        if (
-            force_recalculate
-            or "newline_score" not in df.columns
-            or "repetition_score" not in df.columns
-            or "total_repetitions" not in df.columns
-        ):
-            df[["newline_score", "repetition_score", "total_repetitions"]] = df.apply(
-                detect_scores, axis=1
-            )
-        df["answer_len"] = df["answer"].apply(
-            lambda x: len(x) if isinstance(x, str) else 0
-        )
-        df["nrr"] = df.apply(
-            lambda x: (
-                1
-                if x["answer_len"] == 0
-                else 1 - (x["newline_score"] + x["repetition_score"]) / x["answer_len"]
-            ),
-            axis=1,
-        )
-        df["rr"] = df["nrr"].apply(lambda x: 1 - x)
-        df.to_csv(result_file, index=False)
-    return df
-def replace_last(source_string, old_string, new_string):
-    head, _sep, tail = source_string.rpartition(old_string)
-    return head + new_string + tail
-def load_for_repetition_penalty(
-    csv_result_file, repetition_penalty, force_recalculate=False
-):
-    result_file = replace_last(
-        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
-    )
-    return load_with_newline_and_repetition_scores(
-        result_file, force_recalculate=force_recalculate
-    )
-rap_penalty_functions = {
-    "linear": lambda x: x,
-    "quadratic": lambda x: x * x,
-    "cubic": lambda x: x * x * x,
-    "logarithmic": lambda x: math.log(x + 1, 2),
-    "exponential": lambda x: math.exp(x - 1),
-}
-def calc_adjusted_performance(f, r, l=1, penalty_function="cubic"):
-    n = 1 - r / l if l > 0 else 0
-    return f * rap_penalty_functions[penalty_function](n)
-def calculate_adjusted_performance(row):
-    r = row["total_repetitions"]
-    l = row["answer_len"]
-    adjusted_precision = calc_adjusted_performance(row["precision"], r, l)
-    adjusted_recall = calc_adjusted_performance(row["recall"], r, l)
-    return pd.Series([adjusted_precision, adjusted_recall])
-def load_performance_df(csv_result_file, repetition_penalty):
-    result_file = replace_last(
-        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
-    )
-    result_file = result_file.replace("/results/", "/eval/")
-    print(f"loading json file: {result_file}")
-    df = pd.read_json(result_file)
-    return df
-def calculate_performance_score(
-    csv_result_file, repetition_penalty, force_recalculate=False
-):
-    result_file = replace_last(
-        csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
-    )
-    if os.path.exists(result_file):
-        print(f"loading result file: {result_file}")
-        df = load_with_newline_and_repetition_scores(
-            result_file, force_recalculate=force_recalculate
-        )
-    else:
-        print(f"re-creating result file: {result_file}")
-        df = pd.DataFrame()
-        force_recalculate = True
-    if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
-        try:
-            perf_df = load_performance_df(csv_result_file, repetition_penalty)
-            df.drop(
-                columns=[
-                    "precision",
-                    "recall",
-                    "f1",
-                    "f2",
-                    "entities_in_answer",
-                    "entities_in_question",
-                    "word_count",
-                ],
-                errors="ignore",
-                inplace=True,
-            )
-            df["id"] = perf_df["id"]
-            df["question"] = perf_df["question"]
-            df["answer"] = perf_df["pred_answer"]
-            df["word_count"] = df["answer"].apply(
-                lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
-            )
-            df["ground_truth"] = perf_df["ground_truth"]
-            df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
-            df["precision"] = perf_df["score"].apply(lambda x: x[0])
-            df["recall"] = perf_df["score"].apply(lambda x: x[1])
-            df["f1"] = perf_df["score"].apply(lambda x: x[2])
-        except Exception as e:
-            print(f"\tignored error: {e}")
-            # traceback.print_exc()
-        df[["newline_score", "repetition_score", "total_repetitions"]] = df.apply(
-            detect_scores, axis=1
-        )
-        df["answer_len"] = df["answer"].apply(
-            lambda x: len(x) if isinstance(x, str) else 0
-        )
-        df[["adjusted_precision", "adjusted_recall"]] = df.apply(
-            calculate_adjusted_performance, axis=1
-        )
-        df.to_csv(result_file, index=False)
-        print(f"performance scores saved to result file: {result_file}")
-    # print(f"df len: {len(df)}")
-    return df
-def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
-    newline_score = [
-        df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
-    ]
-    repetition_score = [
-        df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
-    ]
-    answer_len = [
-        df["answer_len"].mean() for df in result["df_list_repetition_penalty"]
-    ]
-    precision = [
-        calc_adjusted_performance(f, n + r, l)
-        for f, n, r, l in zip(precision, newline_score, repetition_score, answer_len)
-    ]
-    recall = [
-        calc_adjusted_performance(f, n + r, l)
-        for f, n, r, l in zip(recall, newline_score, repetition_score, answer_len)
-    ]
-    return precision, recall
-def plot_performance_scores(
-    result,
-    models=None,
-    title="Performance",
-):
-    if models is None:
-        models = result.keys()
-    for model in models:
-        print(f"model: {model}")
-        df = result[model]["df_overall"]
-        # Calculate the statistics
-        precision = [
-            df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
-        ]
-        recall = [
-            df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
-        ]
-        f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-        best_f1 = max(f1)
-        best_f1_index = f1.index(best_f1)
-        precision, recall = adjust_perf_scores_with_repetition_penalty(
-            result[model], precision, recall
-        )
-        afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-        # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
-        best_afrp = max(afrp)
-        best_afrp_index = afrp.index(best_afrp)
-        adjusted_precision = [
-            df["adjusted_precision"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        adjusted_recall = [
-            df["adjusted_recall"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        afrp2 = [
-            2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
-        ]
-        best_afrp2 = max(afrp2)
-        best_afrp2_index = afrp2.index(best_afrp2)
-        repetition_penalties = list(df["repetition_penalty"])
-        # line plot for precision, recall, f1
-        plt.figure(figsize=(10, 6))
-        plt.axvspan(
-            repetition_penalties[best_f1_index] - 0.01,
-            repetition_penalties[best_f1_index] + 0.01,
-            alpha=0.5,
-            edgecolor="none",
-            facecolor="blue",
-        )
-        # plt.axvspan(
-        #     repetition_penalties[best_afrp2_index] - 0.01,
-        #     repetition_penalties[best_afrp2_index] + 0.01,
-        #     alpha=0.5,
-        #     edgecolor="none",
-        #     facecolor="green",
-        # )
-        plt.axvspan(
-            repetition_penalties[best_afrp_index] - 0.01,
-            repetition_penalties[best_afrp_index] + 0.01,
-            alpha=0.5,
-            edgecolor="none",
-            facecolor="orange",
-        )
-        plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
-        # plt.plot(
-        #     repetition_penalties,
-        #     afrp2,
-        #     label="Per-question RAP - F1",
-        #     marker="s",
-        #     color="green",
-        # )
-        plt.plot(
-            repetition_penalties,
-            afrp,
-            label="RAP - F1",
-            marker="o",
-            color="orange",
-        )
-        plt.xlabel("Repetition Penalties")
-        plt.ylabel("Score")
-        # plt.xlim(0.99, 1.31)
-        # y in percentage
-        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-        plt.title(f"{model} {title}")
-        plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-        plt.show()
-def plot_best_afrp(
-    result,
-    models=None,
-    title="Models with Best RAP - F1",
-    ref_result=None,
-):
-    # Initialize lists to store the statistics
-    model_names = []
-    best_f1 = []
-    best_afrp = []
-    best_repetition_penalty = []
-    best_mtr = []
-    if models is None:
-        models = result.keys()
-    for model in models:
-        print(f"model: {model}")
-        df = result[model]["df_overall"]
-        # Calculate the statistics
-        precision = [
-            df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
-        ]
-        recall = [
-            df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
-        ]
-        # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
-        f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-        newline_score = [
-            df["newline_score"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        # print(f"newline_score: {newline_score}")
-        repetition_score = [
-            df["repetition_score"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        # print(f"repetition_score: {repetition_score}")
-        answer_len = [
-            df["answer_len"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        afrp = [
-            calc_adjusted_performance(f, n + r, l)
-            for f, n, r, l in zip(f1, newline_score, repetition_score, answer_len)
-        ]
-        best_afrp.append(max(afrp))
-        best_afrp_index = afrp.index(best_afrp[-1])
-        best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
-        best_f1.append(f1[best_afrp_index])
-        best_mtr.append(
-            newline_score[best_afrp_index] + repetition_score[best_afrp_index]
-        )
-        # print(
-        #     f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
-        # )
-        df = result[model]["df_list_repetition_penalty"][best_afrp_index]
-        model_names.append(
-            f"{model} (RP={best_repetition_penalty[-1]})"
-        )  # Add the model name to the list
-    if ref_result is not None:
-        print("ref_result:", ref_result)
-        for model in ref_result.keys():
-            model_names.append(model)
-            df = pd.read_csv(ref_result[model])
-            # df = df[df["id"].isin(wikidata_df["id"])]
-            p = df["precision"].mean()
-            r = df["recall"].mean()
-            f1 = 2 * p * r / (p + r) if p + r > 0 else 0
-            best_f1.append(f1)
-            best_afrp.append(f1)
-            best_mtr.append(0)
-    print("model_names:", model_names)
-    # print("best_f1:", best_f1)
-    # print("best_afrp:", best_afrp)
-    # Create a DataFrame with the statistics
-    data = pd.DataFrame(
-        {
-            "Model": model_names,
-            "RAP - F1": best_afrp,
-            "F1": best_f1,
-        }
-    )
-    # Melt the DataFrame to a long format
-    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
-    # Pivot the DataFrame to a wide format
-    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
-    # make sure the columns are following the order of the models
-    data_pivoted = data_pivoted[model_names]
-    # make sure the two metric rows are in the order of RAP - F1, F1
-    data_pivoted = data_pivoted.reindex(["RAP - F1", "F1"])
-    # Plot the statistics
-    plt.figure(figsize=(15, 6))
-    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
-    plt.title(title)
-    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-    # Set the rotation of the x-axis labels to 0 degrees
-    plt.xticks(rotation=0)
-    # Format the y-axis to display as percentage
-    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-    # get the max value of the y-axis
-    a1 = max(best_afrp)
-    a2 = max(best_f1)
-    max_value = max([a1, a2]) * 1.12
-    print("max_value:", max_value)
-    # Set the y-axis upper limit to max_value (largest score plus 12% headroom)
-    ax.set_ylim(0, max_value)
-    # Add the values above each bar
-    for p in ax.patches:
-        ax.annotate(
-            f"{p.get_height() * 100:.1f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_height()),
-            ha="center",
-            va="bottom",
-            xytext=(0, 10),
-            textcoords="offset points",
-            rotation=90,
-        )
-    plt.show()
-    return data_pivoted, best_mtr
-def plot_best_performance(
-    result,
-    models=None,
-    title="Models with Best F1 Score",
-    adjusted_f1=False,
-    ref_result=None,
-):
-    # Initialize lists to store the statistics
-    model_names = []
-    best_precision = []
-    best_recall = []
-    best_f1 = []
-    best_repetition_penalty = []
-    best_mtr = []
-    if models is None:
-        models = result.keys()
-    for model in models:
-        print(f"model: {model}")
-        df = result[model]["df_overall"]
-        # Calculate the statistics
-        precision = [
-            df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
-        ]
-        recall = [
-            df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
-        ]
-        newline_score = [
-            df["newline_score"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        repetition_score = [
-            df["repetition_score"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        if adjusted_f1:
-            precision, recall = adjust_perf_scores_with_repetition_penalty(
-                result[model], precision, recall
-            )
-        # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
-        f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-        best_f1.append(max(f1))
-        best_f1_index = f1.index(best_f1[-1])
-        best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
-        best_precision.append(precision[best_f1_index])
-        best_recall.append(recall[best_f1_index])
-        best_mtr.append(newline_score[best_f1_index] + repetition_score[best_f1_index])
-        print(
-            f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
-        )
-        df = result[model]["df_list_repetition_penalty"][best_f1_index]
-        model_names.append(
-            f"{model} (RP={best_repetition_penalty[-1]})"
-        )  # Add the model name to the list
-        # print sum for columns: newline_score, repetition_score
-        print(
-            f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
-        )
-    if ref_result is not None:
-        print("ref_result:", ref_result)
-        for model in ref_result.keys():
-            model_names.append(model)
-            df = pd.read_csv(ref_result[model])
-            # df = df[df["id"].isin(wikidata_df["id"])]
-            best_precision.append(df["precision"].mean())
-            best_recall.append(df["recall"].mean())
-            f1 = (
-                2
-                * (best_precision[-1] * best_recall[-1])
-                / (best_precision[-1] + best_recall[-1])
-            )
-            # best_f1.append(df["f1"].mean())
-            best_f1.append(f1)
-            best_mtr.append(0)
-    # Create a DataFrame with the statistics
-    data = (
-        pd.DataFrame(
-            {
-                "Model": model_names,
-                "Adjusted Precision with RP": best_precision,
-                "Adjusted Recall with RP": best_recall,
-                "Adjusted F1 with RP": best_f1,
-            }
-        )
-        if adjusted_f1
-        else pd.DataFrame(
-            {
-                "Model": model_names,
-                "Precision": best_precision,
-                "Recall": best_recall,
-                "F1": best_f1,
-            }
-        )
-    )
-    columns = list(data.columns)
-    # Melt the DataFrame to a long format
-    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
-    # Pivot the DataFrame to a wide format
-    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
-    # make sure the columns are following the order of the models
-    data_pivoted = data_pivoted[model_names]
-    # make sure three groups in the order of precision, recall, f1
-    data_pivoted = data_pivoted.reindex(columns[1:])
-    # Plot the statistics
-    plt.figure(figsize=(10, 6))
-    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
-    plt.title(title)
-    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-    # Set the rotation of the x-axis labels to 0 degrees
-    plt.xticks(rotation=0)
-    # Format the y-axis to display as percentage
-    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-    # get the max value of the y-axis
-    a1 = max(best_precision)
-    a2 = max(best_recall)
-    a3 = max(best_f1)
-    max_value = max([a1, a2, a3]) * 1.12
-    print("max_value:", max_value)
-    # Set the y-axis upper limit to max_value (largest score plus 12% headroom)
-    ax.set_ylim(0, max_value)
-    # Add the values above each bar
-    for p in ax.patches:
-        ax.annotate(
-            f"{p.get_height() * 100:.1f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_height()),
-            ha="center",
-            va="bottom",
-            xytext=(0, 10),
-            textcoords="offset points",
-            rotation=90,
-        )
-    plt.show()
-    return data_pivoted, best_mtr
-def plot_best_performance_ms_macro(
-    result,
-    models=None,
-    title="Models with Best RAP - Performance",
-    ref_result=None,
-    skip_generic_prompt=False,
-    include_adjusted_performance=True,
-):
-    # Initialize lists to store the statistics
-    model_names = []
-    best_f1 = []
-    best_afrp = []
-    best_repetition_penalty = []
-    best_bleu1 = []
-    best_rougeL = []
-    best_mtr = []
-    if models is None:
-        models = result.keys()
-    for model in models:
-        if skip_generic_prompt and "generic prompt" in model:
-            continue
-        print(f"model: {model}")
-        df = result[model]["df_overall"]
-        # Calculate the statistics
-        bleu1 = [x for x in df["bleu1"]]
-        rougeL = [x for x in df["rougeL"]]
-        f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
-        newline_score = [
-            df["newline_score"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        # print(f"newline_score: {newline_score}")
-        repetition_score = [
-            df["repetition_score"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        # print(f"repetition_score: {repetition_score}")
-        answer_len = [
-            df["answer_len"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        afrp = [
-            calc_adjusted_performance(f, n + r, l)
-            for f, n, r, l in zip(f1, newline_score, repetition_score, answer_len)
-        ]
-        best_afrp.append(max(afrp if include_adjusted_performance else f1))
-        best_afrp_index = (
-            afrp.index(best_afrp[-1])
-            if include_adjusted_performance
-            else f1.index(best_afrp[-1])
-        )
-        best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
-        best_f1.append(f1[best_afrp_index])
-        best_bleu1.append(bleu1[best_afrp_index])
-        best_rougeL.append(rougeL[best_afrp_index])
-        best_mtr.append(
-            newline_score[best_afrp_index] + repetition_score[best_afrp_index]
-        )
-        # print(
-        #     f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
-        # )
-        df = result[model]["df_list_repetition_penalty"][best_afrp_index]
-        model_names.append(
-            f"{model} (RP={best_repetition_penalty[-1]})"
-        )  # Add the model name to the list
-    if ref_result is not None:
-        print("ref_result:", ref_result)
-        for model in ref_result.keys():
-            model_names.append(model)
-            df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
-            # df = df[df["id"].isin(wikidata_df["id"])]
-            p = df["bleu1"][0]
-            best_bleu1.append(p)
-            r = df["rougeL"][0]
-            best_rougeL.append(r)
-            f1 = 2 * p * r / (p + r) if p + r > 0 else 0
-            best_f1.append(f1)
-            best_afrp.append(f1)
-            best_mtr.append(0)
-    # print("model_names:", model_names)
-    # print("best_f1:", best_f1)
-    # print("best_afrp:", best_afrp)
-    # Create a DataFrame with the statistics
-    data = (
-        pd.DataFrame(
-            {
-                "Model": model_names,
-                "RAP - Perf Score": best_afrp,
-                "Overall Perf Score": best_f1,
-            }
-        )
-        if include_adjusted_performance
-        else pd.DataFrame(
-            {
-                "Model": model_names,
-                "Bleu-1": best_bleu1,
-                "Rouge-L": best_rougeL,
-                "Overall Perf Score": best_f1,
-            }
-        )
-    )
-    # Melt the DataFrame to a long format
-    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
-    # Pivot the DataFrame to a wide format
-    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
-    # make sure the columns are following the order of the models
-    data_pivoted = data_pivoted[model_names]
-    columns = list(data.columns)
-    data_pivoted = data_pivoted.reindex(columns[1:])
-    # Plot the statistics
-    plt.figure(figsize=(10, 6))
-    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
-    plt.title(title)
-    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-    # Set the rotation of the x-axis labels to 0 degrees
-    plt.xticks(rotation=0)
-    # Format the y-axis to display as percentage
-    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-    # get the max value of the y-axis
-    a1 = max(best_afrp)
-    a2 = max(best_f1)
-    a3 = max(best_bleu1)
-    a4 = max(best_rougeL)
-    max_value = (
-        max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
-    )
-    print("max_value:", max_value)
-    # Set the y-axis upper limit to max_value (largest score plus 12% headroom)
-    ax.set_ylim(0, max_value)
-    # Add the values above each bar
-    for p in ax.patches:
-        ax.annotate(
-            f"{p.get_height() * 100:.1f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_height()),
-            ha="center",
-            va="bottom",
-            xytext=(0, 10),
-            textcoords="offset points",
-            rotation=90,
-        )
-    plt.show()
-    return data_pivoted, best_mtr
-all_open_source_models = [
-    "gemma-1.1-2b-it",
-    "Phi-3-mini-128k-instruct",
-    "gemma-1.1-7b-it",
-    "Llama-2-7b-chat-hf",
-    "Mistral-7B-Instruct-v0.2",
-    "Meta-Llama-3-8B-Instruct",
-    "Llama-2-13b-chat-hf",
-    "Llama-2-70b-chat-hf",
-    "Meta-Llama-3-70B-Instruct",
-]
-def load_for_repetition_penalty_ms_macro(
-    csv_result_file, repetition_penalty, force_recalculate=False
-):
-    result_file = replace_last(
-        csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
-    )
-    df = load_with_newline_and_repetition_scores(
-        result_file, force_recalculate=force_recalculate
-    )
-    return df
-# MS MACRO
-def plot_performance_scores_ms_macro(
-    result,
-    models=None,
-    title="Performance",
-):
-    if models is None:
-        models = result.keys()
-    for model in models:
-        print(f"model: {model}")
-        df = result[model]["df_overall"]
-        # print(result[model]["df_list_repetition_penalty"][0].describe())
-        # Calculate the statistics
-        bleu1 = list(df["bleu1"])
-        rougeL = list(df["rougeL"])
-        f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
-        best_f1 = max(f1)
-        best_f1_index = f1.index(best_f1)
-        bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
-            result[model], bleu1, rougeL
-        )
-        afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
-        # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
-        best_afrp = max(afrp)
-        best_afrp_index = afrp.index(best_afrp)
-        repetition_penalties = list(df["repetition_penalty"])
-        # line plot for precision, recall, f1
-        plt.figure(figsize=(10, 6))
-        plt.axvspan(
-            repetition_penalties[best_f1_index] - 0.01,
-            repetition_penalties[best_f1_index] + 0.01,
-            alpha=0.5,
-            edgecolor="none",
-            facecolor="blue",
-        )
-        plt.axvspan(
-            repetition_penalties[best_afrp_index] - 0.01,
-            repetition_penalties[best_afrp_index] + 0.01,
-            alpha=0.5,
-            edgecolor="none",
-            facecolor="orange",
-        )
-        plt.plot(
-            repetition_penalties,
-            f1,
-            label="Overall Perf Score",
-            marker="D",
-            color="blue",
-        )
-        plt.plot(
-            repetition_penalties,
-            afrp,
-            label="RAP - Perf Score",
-            marker="o",
-            color="orange",
-        )
-        plt.xlabel("Repetition Penalties")
-        plt.ylabel("Score")
-        # plt.xlim(0.99, 1.31)
-        # y in percentage
-        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-        plt.title(f"{model} {title}")
-        plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-        plt.show()
-def plot_repetition_factors(result, groups):
-    for group in groups:
-        # Plot the statistics
-        plt.figure(figsize=(10, 6))
-        max_value = 0
-        for model in result.keys():
-            if not group in model.lower():
-                continue
-            print(f"model: {model}")
-            df = result[model]["df_overall"]
-            repetition_panelties = [
-                repetition_penalty for repetition_penalty in df["repetition_penalty"]
-            ]
-            mean_score = [
-                df["total_repetitions"].mean()
-                for df in result[model]["df_list_repetition_penalty"]
-            ]
-            sns.lineplot(x=repetition_panelties, y=mean_score, label=model)
-            new_max = max(mean_score)
-            if new_max > max_value:
-                max_value = new_max
-        max_value = max_value * 1.05
-        # if max_value < 1.5:
-        #     max_value = 1.5
-        # set ylimit
-        plt.ylim(0, max_value)
-        # show grid
-        plt.grid(True)
-        plt.xlabel("Repetition Penalties")
-        plt.ylabel("Mean Total Repetitions")
-        plt.title("Mean Total Repetitions vs Repetition Penalties")
-        plt.legend()
-        plt.show()
-def plot_repetition_factors_by_group(result, group_filter=None):
-    markers = ["D", "o", "s", "x"]
-    colors = ["blue", "orange", "green", "red"]
-    # Plot the statistics
-    plt.figure(figsize=(10, 6))
-    index = 0
-    max_value = 0
-    for model in result.keys():
-        if group_filter is not None and group_filter not in model:
-            continue
-        print(f"model: {model}")
-        df = result[model]["df_overall"]
-        repetition_panelties = [
-            repetition_penalty for repetition_penalty in df["repetition_penalty"]
-        ]
-        # Calculate the statistics
-        mean_score = [
-            df["total_repetitions"].mean()
-            for df in result[model]["df_list_repetition_penalty"]
-        ]
-        if len(mean_score) != len(repetition_panelties):
-            print(
-                f"model: {model} has different length of repetition penalties and mean score"
-            )
-            print("repetition_panelties:", len(repetition_panelties))
-            print("mean_score:", len(mean_score))
-            continue
-        new_max = max(mean_score)
-        if new_max > max_value:
-            max_value = new_max
-        sns.lineplot(
-            x=repetition_panelties,
-            y=mean_score,
-            label=model,
-            marker=markers[index],
-            color=colors[index],
-        )
-        index += 1
-    max_value = max_value * 1.05
-    # if max_value < 1.5:
-    #     max_value = 1.5
-    # set ylimit
-    plt.ylim(0, max_value)
-    max_value = 0
-    plt.xlabel("Repetition Penalties")
-    plt.ylabel("Mean Total Repetitions")
-    plt.title("Mean Total Repetitions vs Repetition Penalties")
-    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-    plt.show()
-ms_marco_csv_result_files = [
-    "data/results_v2/gemma-1.1-2b-it(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/gemma-1.1-2b-it(RAG - Chat Template)_mm.csv",
-    "data/results_v2/gemma-1.1-2b-it(Non-RAG)_mm.csv",
-    "data/results_v2/Phi-3-mini-128k-instruct(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/Phi-3-mini-128k-instruct(RAG - Chat Template)_mm.csv",
-    "data/results_v2/Phi-3-mini-128k-instruct(Non-RAG)_mm.csv",
-    "data/results_v2/gemma-1.1-7b-it(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/gemma-1.1-7b-it(RAG - Chat Template)_mm.csv",
-    "data/results_v2/gemma-1.1-7b-it(Non-RAG)_mm.csv",
-    "data/results_v2/Llama-2-7b-chat-hf(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/Llama-2-7b-chat-hf(RAG - Chat Template)_mm.csv",
-    "data/results_v2/Llama-2-7b-chat-hf(Non-RAG)_mm.csv",
-    "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Chat Template)_mm.csv",
-    "data/results_v2/Mistral-7B-Instruct-v0.2(Non-RAG)_mm.csv",
-    "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Chat Template)_mm.csv",
-    "data/results_v2/Meta-Llama-3-8B-Instruct(Non-RAG)_mm.csv",
-    "data/results_v2/Llama-2-13b-chat-hf(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/Llama-2-13b-chat-hf(RAG - Chat Template)_mm.csv",
-    "data/results_v2/Llama-2-13b-chat-hf(Non-RAG)_mm.csv",
-    "data/results_v2/Llama-2-70b-chat-hf(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/Llama-2-70b-chat-hf(RAG - Chat Template)_mm.csv",
-    "data/results_v2/Llama-2-70b-chat-hf(Non-RAG)_mm.csv",
-    "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Generic Prompt)_mm.csv",
-    "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Chat Template)_mm.csv",
-    "data/results_v2/Meta-Llama-3-70B-Instruct(Non-RAG)_mm.csv",
-]
-webqsp_csv_result_files = [
-    "data/results_v2/gemma-1.1-2b-it(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/gemma-1.1-2b-it(RAG - Chat Template)_wd.csv",
-    "data/results_v2/gemma-1.1-2b-it(Non-RAG)_wd.csv",
-    "data/results_v2/Phi-3-mini-128k-instruct(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/Phi-3-mini-128k-instruct(RAG - Chat Template)_wd.csv",
-    "data/results_v2/Phi-3-mini-128k-instruct(Non-RAG)_wd.csv",
-    "data/results_v2/gemma-1.1-7b-it(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/gemma-1.1-7b-it(RAG - Chat Template)_wd.csv",
-    "data/results_v2/gemma-1.1-7b-it(Non-RAG)_wd.csv",
-    "data/results_v2/Llama-2-7b-chat-hf(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/Llama-2-7b-chat-hf(RAG - Chat Template)_wd.csv",
-    "data/results_v2/Llama-2-7b-chat-hf(Non-RAG)_wd.csv",
-    "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Chat Template)_wd.csv",
-    "data/results_v2/Mistral-7B-Instruct-v0.2(Non-RAG)_wd.csv",
-    "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Chat Template)_wd.csv",
-    "data/results_v2/Meta-Llama-3-8B-Instruct(Non-RAG)_wd.csv",
-    "data/results_v2/Llama-2-13b-chat-hf(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/Llama-2-13b-chat-hf(RAG - Chat Template)_wd.csv",
-    "data/results_v2/Llama-2-13b-chat-hf(Non-RAG)_wd.csv",
-    "data/results_v2/Llama-2-70b-chat-hf(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/Llama-2-70b-chat-hf(RAG - Chat Template)_wd.csv",
-    "data/results_v2/Llama-2-70b-chat-hf(Non-RAG)_wd.csv",
-    "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Generic Prompt)_wd.csv",
-    "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Chat Template)_wd.csv",
-    "data/results_v2/Meta-Llama-3-70B-Instruct(Non-RAG)_wd.csv",
-]
-def calc_rap_scores(
-    result, precision="precision", recall="recall", penalty_function="cubic"
-):
-    newline_score = [
-        df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
-    ]
-    repetition_score = [
-        df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
-    ]
-    if precision in result["df_list_repetition_penalty"][0].columns:
-        precision = [
-            df[precision].mean() for df in result["df_list_repetition_penalty"]
-        ]
-        recall = [df[recall].mean() for df in result["df_list_repetition_penalty"]]
-    else:
-        precision = result["df_overall"][precision]
-        recall = result["df_overall"][recall]
-    f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-    nrr = [
-        1 - (n + r) / s
-        for f, n, r, s in zip(
-            f1, newline_score, repetition_score, result["df_overall"]["answer_len"]
-        )
-    ]
-    rap = [
-        calc_adjusted_performance(f, 1 - n, penalty_function=penalty_function)
-        for f, n in zip(f1, nrr)
-    ]
-    return newline_score, repetition_score, f1, rap, nrr
-def get_model_name(csv_result_file):
-    parts = re.split(r"[_/]", csv_result_file)
-    print(f"parts: {parts}")
-    model_name = parts[3]
-    return model_name
-def load_webqsp_result(
-    csv_result_files, force_recalculate=False, save=False, penalty_function="cubic"
-):
-    result = {}
-    for i, csv_result_file in enumerate(csv_result_files):
-        try:
-            df = pd.read_csv(csv_result_file)
-            model_name = get_model_name(csv_result_file)
-            print(f"\tmodel_name: {model_name}")
-            dfs = [
-                calculate_performance_score(
-                    csv_result_file,
-                    repetition_penalty,
-                    force_recalculate=force_recalculate,
-                )
-                for repetition_penalty in df["repetition_penalty"]
-            ]
-            answer_lens = []
-            for df_rpp in dfs:
-                answer_lens.append(df_rpp["answer_len"].mean())
-            df["answer_len"] = answer_lens
-            result[model_name] = {
-                "df_overall": df,
-                "df_list_repetition_penalty": dfs,
-                "file": csv_result_file,
-            }
-            newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
-                result[model_name], penalty_function=penalty_function
-            )
-            df["newline_score"] = newline_score
-            df["repetition_score"] = repetition_score
-            df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
-            df["perf"] = perf
-            df["nrr"] = nrr
-            df["rap"] = rap
-            df["rr"] = df["nrr"].apply(lambda x: 1 - x)
-            df["rrp"] = df["rr"].apply(lambda x: x * 100)
-            if save:
-                df.to_csv(csv_result_file, index=False)
-        except Exception as e:
-            print(f"Error: {e}")
-            traceback.print_exc()
-    return result
-def load_ms_marco_result(
-    csv_result_files,
-    force_recalculate=False,
-    calc_bertscore=True,
-    save=False,
-    penalty_function="cubic",
-):
-    result = {}
-    for csv_result_file in csv_result_files:
-        try:
-            df = pd.read_csv(csv_result_file)
-            model_name = get_model_name(csv_result_file)
-            print(f"\tmodel_name: {model_name}")
-            dfs = [
-                load_for_repetition_penalty_ms_macro(
-                    csv_result_file,
-                    repetition_penalty,
-                    force_recalculate=force_recalculate,
-                )
-                for repetition_penalty in df["repetition_penalty"]
-            ]
-            answer_lens = []
-            for df_rpp in dfs:
-                answer_lens.append(df_rpp["answer_len"].mean())
-            df["answer_len"] = answer_lens
-            col = "bert_score" if calc_bertscore else "meteor"
-            score_unavailable = col not in df.columns
-            if score_unavailable:
-                save = True
-                bert_meteor_scores = []
-                bert_score_references = None
-                for df_rpp in dfs:
-                    if calc_bertscore:
-                        bert_meteor_score = 0
-                        for i, row in df_rpp.iterrows():
-                            answer = row["answer"]
-                            if not isinstance(answer, str):
-                                answer = ""
-                            bert_meteor_score += bert_score.compute(
-                                predictions=[answer],
-                                references=[row["ground_truth"][0]],
-                                lang="en",
-                                model_type="microsoft/deberta-large-mnli",
-                            )["f1"][0]
-                        # get average of bertscore
-                        bert_meteor_score = bert_meteor_score / len(df_rpp)
-                        print(f"bert_score: {bert_meteor_score}")
-                    else:
-                        bert_meteor_score = meteor.compute(
-                            predictions=df_rpp["answer"],
-                            references=df_rpp["ground_truth"],
-                        )["meteor"]
-                    bert_meteor_scores.append(bert_meteor_score)
-                df[col] = bert_meteor_scores
-            result[model_name] = {
-                "df_overall": df,
-                "df_list_repetition_penalty": dfs,
-                "file": csv_result_file,
-            }
-            newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
-                result[model_name],
-                precision=col,
-                recall=col,
-                penalty_function=penalty_function,
-            )
-            df["newline_score"] = newline_score
-            df["repetition_score"] = repetition_score
-            df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
-            df["perf"] = perf
-            df["nrr"] = nrr
-            df["rap"] = rap
-            df["rr"] = df["nrr"].apply(lambda x: 1 - x)
-            df["rrp"] = df["rr"].apply(lambda x: x * 100)
-            if save:
-                df.to_csv(csv_result_file, index=False)
-        except Exception as e:
-            print("An error occurred:", e)
-            traceback.print_exc()
-            print(f"csv_result_file: {csv_result_file}")
-    return result

eval_modules/utils.py CHANGED Viewed

@@ -1,174 +1,85 @@
 # -*- coding:utf-8 -*-
 from __future__ import annotations
-import json
-import logging
-import os
-import platform
 import re
-from pathlib import Path
 import evaluate
 import pandas as pd
-import requests
-import torch
-from tqdm import tqdm
-class LogRecord(logging.LogRecord):
-    def getMessage(self):
-        msg = self.msg
-        if self.args:
-            if isinstance(self.args, dict):
-                msg = msg.format(**self.args)
-            else:
-                msg = msg.format(*self.args)
-        return msg
-class Logger(logging.Logger):
-    def makeRecord(
-        self,
-        name,
-        level,
-        fn,
-        lno,
-        msg,
-        args,
-        exc_info,
-        func=None,
-        extra=None,
-        sinfo=None,
-    ):
-        rv = LogRecord(name, level, fn, lno, msg, args, exc_info, func, sinfo)
-        if extra is not None:
-            for key in extra:
-                rv.__dict__[key] = extra[key]
-        return rv
-def init_settings():
-    logging.setLoggerClass(Logger)
-    logging.basicConfig(
-        level=logging.WARNING,
-        format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
-    )
-def remove_extra_spaces(text):
-    return re.sub(" +", " ", text.strip())
-def print_llm_response(llm_response, debug_retrieval=True):
-    answer = llm_response["answer"] if "answer" in llm_response else None
-    if answer is None:
-        answer = llm_response["response"] if "response" in llm_response else None
-    if answer is not None:
-        print("\n\n***Answer:")
-        print(answer)
-    source_documents = (
-        llm_response["source_documents"] if "source_documents" in llm_response else None
-    )
-    if source_documents is None:
-        source_documents = (
-            llm_response["sourceDocs"] if "sourceDocs" in llm_response else None
-        )
-    if debug_retrieval and source_documents is not None:
-        print("\nSources:")
-        for index, source in enumerate(source_documents):
-            metadata = source["metadata"] if "metadata" in source else source.metadata
-            if "page" in metadata:
-                print(f" Page:  {metadata['page']}", end="")
-            print(
-                f" Source {index + 1}: "
-                + str(metadata["url"] if "url" in metadata else metadata["source"])
-            )
-            print(
-                source["page_content"]
-                if "page_content" in source
-                else source.page_content
-            )
-    if "chat_history" in llm_response:
-        print("\nChat History:")
-        print(llm_response["chat_history"])
-def get_device_types():
-    print("Running on: ", platform.platform())
-    print("MPS is", "NOT" if not torch.backends.mps.is_available() else "", "available")
-    print("CUDA is", "NOT" if not torch.cuda.is_available() else "", "available")
-    device_type_available = "cpu"
-    if not torch.backends.mps.is_available():
-        if not torch.backends.mps.is_built():
-            print(
-                "MPS not available because the current PyTorch install was not "
-                "built with MPS enabled."
-            )
-        else:
-            print(
-                "MPS not available because the current MacOS version is not 12.3+ "
-                "and/or you do not have an MPS-enabled device on this machine."
-            )
-    else:
-        device_type_available = "mps"
-    if torch.cuda.is_available():
-        print("CUDA is available, we have found ", torch.cuda.device_count(), " GPU(s)")
-        print(torch.cuda.get_device_name(0))
-        print("CUDA version: " + torch.version.cuda)
-        device_type_available = f"cuda:{torch.cuda.current_device()}"
-    return (
-        os.environ.get("HF_EMBEDDINGS_DEVICE_TYPE") or device_type_available,
-        os.environ.get("HF_PIPELINE_DEVICE_TYPE") or device_type_available,
     )
-def ensure_model_is_downloaded(llm_model_type):
-    if llm_model_type.startswith("gpt4all"):
-        local_path = (
-            os.environ.get("GPT4ALL_J_MODEL_PATH")
-            if llm_model_type == "gpt4all-j"
-            else os.environ.get("GPT4ALL_MODEL_PATH")
-        )
-        url = (
-            os.environ.get("GPT4ALL_J_DOWNLOAD_LINK")
-            if llm_model_type == "gpt4all-j"
-            else os.environ.get("GPT4ALL_DOWNLOAD_LINK")
-        )
-    elif llm_model_type == "llamacpp":
-        local_path = os.environ.get("LLAMACPP_MODEL_PATH")
-        url = os.environ.get("LLAMACPP_DOWNLOAD_LINK")
-    elif llm_model_type == "ctransformers":
-        local_path = os.environ.get("CTRANSFORMERS_MODEL_PATH")
-        url = os.environ.get("CTRANSFORMERS_DOWNLOAD_LINK")
-    else:
-        raise ValueError(f"wrong model typle: {llm_model_type}")
-    path = Path(local_path)
-    if path.is_file():
-        print(f"model: {local_path} exists")
-    else:
-        print(f"downloading model: {local_path} from {url} ...")
-        path.parent.mkdir(parents=True, exist_ok=True)
-        # send a GET request to the URL to download the file. Stream since it's large
-        response = requests.get(url, stream=True)
-        # open the file in binary mode and write the contents of the response to it in chunks
-        # This is a large file, so be prepared to wait.
-        with open(local_path, "wb") as f:
-            for chunk in tqdm(response.iter_content(chunk_size=8192)):
-                if chunk:
-                    f.write(chunk)
-    return local_path
 bleu = evaluate.load("bleu")

 # -*- coding:utf-8 -*-
 from __future__ import annotations
 import re
 import evaluate
 import pandas as pd
+print(f"loading: {__file__}")
+# pattern_non_word_char_repetition = re.compile(r"\s{5,}")
+# pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
+# final version
+pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}")
+pattern_text_repetitions = re.compile(
+    r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+", re.M | re.DOTALL | re.IGNORECASE
+)
+# Explanation of the regex pattern:
+#   (?P<repeat>.{5}.*?): Captures a sequence of at least 5 characters and names the group "repeat".
+#     .*?: Matches zero or more further characters, non-greedily (as few as possible).
+#   (?:[\s\W]*(?P=repeat))+: A non-capturing group that matches one or more repetitions of:
+#     [\s\W]*: Zero or more whitespace or non-word characters (spaces, punctuation, etc.).
+#     (?P=repeat): A backreference to the named group "repeat".
+def del_non_word_char_repetition(text, debug=False):
+    count = 0
+    if isinstance(text, str):
+        if debug:
+            print("----detect non-word characters repetition----")
+        count = len(text)
+        text = pattern_non_word_char_repetition.sub("\t", text)
+        count -= len(text)
+        if debug and count:
+            print(f"removed non-word characters repetition: {count}")
+    return text, count
+# final version for repetition detection
+def detect_text_repetitions(text, debug=False):
+    count = 0
+    if isinstance(text, str):
+        if debug:
+            print("----detect text repetitions----")
+        matches = pattern_text_repetitions.finditer(text)
+        for match in matches:
+            if debug:
+                print(match)
+                for groupNum in range(0, len(match.groups())):
+                    groupNum = groupNum + 1
+                    print(
+                        "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                            groupNum=groupNum,
+                            start=match.start(groupNum),
+                            end=match.end(groupNum),
+                            group=match.group(groupNum),
+                        )
+                    )
+            start, end = match.span()
+            count += end - start - len(match.group(1))
+    return count
+def detect_repetitions(text, debug=False):
+    if isinstance(text, str) is False:
+        return 0, 0, 0
+    text, count_non_word_char_repetition = del_non_word_char_repetition(
+        text, debug=debug
     )
+    count_text_repetitions = detect_text_repetitions(text, debug=debug)
+    total_repetitions = count_non_word_char_repetition + count_text_repetitions
+    result = (count_non_word_char_repetition, count_text_repetitions, total_repetitions)
+    if debug:
+        print(result)
+    return result
 bleu = evaluate.load("bleu")