Improvement in the display of the graph axes labels. Generalization of rankSent class. Minor fixes.

Files changed:
- modules/module_BiasExplorer.py  +22 -10
- modules/module_connection.py    +12 -10
- modules/module_rankSents.py     +29 -25
- modules/utils.py                +64 -3
modules/module_BiasExplorer.py CHANGED

@@ -5,7 +5,7 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
 from typing import List, Dict, Tuple, Optional, Any
-from modules.utils import normalize, cosine_similarity, project_params, take_two_sides_extreme_sorted
+from modules.utils import normalize, cosine_similarity, project_params, take_two_sides_extreme_sorted, axes_labels_format
 
 __all__ = ['WordBiasExplorer', 'WEBiasExplorer2Spaces', 'WEBiasExplorer4Spaces']
 
@@ -371,9 +371,14 @@ class WEBiasExplorer2Spaces(WordBiasExplorer):
         plt.xticks(np.arange(-most_extream_projection,
                              most_extream_projection + axis_projection_step,
                              axis_projection_step))
-        xlabel = '← {} {} {} →'.format(self.negative_end,
-                                       ' ' * 20,
-                                       self.positive_end)
+
+
+        xlabel = axes_labels_format(
+            left=self.negative_end,
+            right=self.positive_end,
+            sep=' ' * 20,
+            word_wrap=3
+        )
 
         plt.xlabel(xlabel)
         plt.ylabel('Words')
@@ -515,13 +520,20 @@ class WEBiasExplorer4Spaces(WordBiasExplorer):
         for _, row in (projections_df.iterrows()):
             ax.annotate(
                 row['word'], (row['projection_x'], row['projection_y']))
-        x_label = '← {} {} {} →'.format(name_left,
-                                        ' ' * 20,
-                                        name_right)
 
-        y_label = '← {} {} {} →'.format(name_top,
-                                        ' ' * 20,
-                                        name_bottom)
+
+        x_label = axes_labels_format(
+            left=name_left,
+            right=name_right,
+            sep=' ' * 20,
+            word_wrap=3
+        )
+        y_label = axes_labels_format(
+            left=name_top,
+            right=name_bottom,
+            sep=' ' * 20,
+            word_wrap=3
+        )
 
         plt.xlabel(x_label)
         ax.xaxis.set_label_position('bottom')
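With this change the plots no longer hard-code a single '← left … right →' label string; they delegate to the shared axes_labels_format helper (added in modules/utils.py below), which wraps long comma-separated word lists across rows. A minimal sketch of the call, with illustrative word lists standing in for self.negative_end / self.positive_end:

from modules.utils import axes_labels_format

# Illustrative inputs; the explorer passes its own word lists here.
xlabel = axes_labels_format(
    left="mujer, femenino",
    right="hombre, masculino",
    sep=" " * 20,
    word_wrap=3,
)
print(xlabel)  # first row framed by '← … →'; overflow words wrap onto new rows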
modules/module_connection.py CHANGED

@@ -422,11 +422,12 @@ class PhraseBiasExplorerConnector(Connector):
     def rank_sentence_options(
         self,
         sent: str,
-        …
+        interest_word_list: str,
         banned_word_list: str,
-        …
-        …
-        …
+        exclude_articles: bool,
+        exclude_prepositions: bool,
+        exclude_conjunctions: bool,
+        n_predictions: int=5
     ) -> Tuple:
 
         sent = " ".join(sent.strip().replace("*"," * ").split())
@@ -435,7 +436,7 @@ class PhraseBiasExplorerConnector(Connector):
         if err:
             return err, "", ""
 
-        …
+        interest_word_list = self.parse_words(interest_word_list)
         banned_word_list = self.parse_words(banned_word_list)
 
         # Save inputs in logs file
@@ -443,16 +444,17 @@ class PhraseBiasExplorerConnector(Connector):
             self.logs_file_name,
             self.headers,
             sent,
-            …
+            interest_word_list
         )
 
         all_plls_scores = self.phrase_bias_explorer.rank(
             sent,
-            …
+            interest_word_list,
             banned_word_list,
-            …
-            …
-            …
+            exclude_articles,
+            exclude_prepositions,
+            exclude_conjunctions,
+            n_predictions
         )
 
         all_plls_scores = self.phrase_bias_explorer.Label.compute(all_plls_scores)
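For reference, a hedged sketch of calling the widened connector entry point; `connector` stands for an already-constructed PhraseBiasExplorerConnector, the sentence and word lists are illustrative, and the names of the unpacked results are assumptions (only the three-element shape is implied by the error path `return err, "", ""` above):

msg, scores, plot = connector.rank_sentence_options(
    sent="La persona trabaja como *",
    interest_word_list="",    # empty: candidates come from the model
    banned_word_list="",
    exclude_articles=True,
    exclude_prepositions=True,
    exclude_conjunctions=True,
    n_predictions=5,
)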
modules/module_rankSents.py CHANGED

@@ -66,13 +66,14 @@ class RankSents:
 
         return self.errorManager.process(out_msj)
 
-    def …(
+    def getTopPredictions(
         self,
+        n: int,
         sent: str,
-        …
-        …
-        …
-        …
+        banned_word_list: List[str],
+        exclude_articles: bool,
+        exclude_prepositions: bool,
+        exclude_conjunctions: bool,
     ) -> List[str]:
 
         sent_masked = sent.replace("*", self.tokenizer.mask_token)
@@ -80,7 +81,8 @@ class RankSents:
             sent_masked,
             add_special_tokens=True,
             return_tensors='pt',
-            return_attention_mask=True,
+            return_attention_mask=True,
+            truncation=True
         )
 
         tk_position_mask = torch.where(inputs['input_ids'][0] == self.tokenizer.mask_token_id)[0].item()
@@ -94,26 +96,26 @@ class RankSents:
         probabilities = outputs[tk_position_mask]
         first_tk_id = torch.argsort(probabilities, descending=True)
 
-        …
+        top_tks_pred = []
         for tk_id in first_tk_id:
             tk_string = self.tokenizer.decode([tk_id])
 
-            tk_is_banned = tk_string in …
+            tk_is_banned = tk_string in banned_word_list
             tk_is_punctuation = not tk_string.isalnum()
             tk_is_substring = tk_string.startswith("##")
             tk_is_special = (tk_string in self.tokenizer.all_special_tokens)
 
-            if …:
+            if exclude_articles:
                 tk_is_article = tk_string in self.articles
             else:
                 tk_is_article = False
 
-            if …:
+            if exclude_prepositions:
                 tk_is_prepositions = tk_string in self.prepositions
             else:
                 tk_is_prepositions = False
 
-            if …:
+            if exclude_conjunctions:
                 tk_is_conjunctions = tk_string in self.conjunctions
             else:
                 tk_is_conjunctions = False
@@ -128,39 +130,41 @@ class RankSents:
                 tk_is_conjunctions
             ])
 
-            if predictions_is_dessire and len(…
-                …
+            if predictions_is_dessire and len(top_tks_pred) < n:
+                top_tks_pred.append(tk_string)
 
-            elif len(…
+            elif len(top_tks_pred) >= n:
                 break
 
-        return …
+        return top_tks_pred
 
     def rank(self,
         sent: str,
-        …
+        interest_word_list: List[str]=[],
         banned_word_list: List[str]=[],
-        …
-        …
-        …
+        exclude_articles: bool=False,
+        exclude_prepositions: bool=False,
+        exclude_conjunctions: bool=False,
+        n_predictions: int=5
    ) -> Dict[str, float]:
 
         err = self.errorChecking(sent)
         if err:
             raise Exception(err)
 
-        if not …:
-            …
+        if not interest_word_list:
+            interest_word_list = self.getTopPredictions(
+                n_predictions,
                 sent,
                 banned_word_list,
-                …
-                …
-                …
+                exclude_articles,
+                exclude_prepositions,
+                exclude_conjunctions
             )
 
         sent_list = []
         sent_list2print = []
-        for word in …:
+        for word in interest_word_list:
             sent_list.append(sent.replace("*", "<"+word+">"))
             sent_list2print.append(sent.replace("*", "<"+word+">"))
 
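The generalization in practice: when interest_word_list is empty, rank() falls back to getTopPredictions(), which walks the model's most probable fillers for the masked position and skips banned words, punctuation, subword pieces ("##…"), special tokens and, when the matching exclude_* flag is set, articles, prepositions and conjunctions, until n candidates are collected. A hedged usage sketch, with `ranker` standing for a constructed RankSents instance and an illustrative sentence:

scores = ranker.rank(
    "El médico le dijo a la enfermera que *",  # '*' marks the slot to fill
    interest_word_list=[],       # empty -> filled via getTopPredictions
    banned_word_list=["muy"],
    exclude_articles=True,
    exclude_prepositions=True,
    exclude_conjunctions=True,
    n_predictions=5,
)
# scores: Dict[str, float], keyed by the sentence with '*' replaced by '<word>'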
modules/utils.py CHANGED

@@ -1,13 +1,15 @@
 import numpy as np
 import pandas as pd
-from datetime import datetime
 import pytz
+from datetime import datetime
+from typing import List
+
 
 
 class DateLogs:
     def __init__(
         self,
-        zone: str="America/Argentina/Cordoba"
+        zone: str = "America/Argentina/Cordoba"
     ) -> None:
 
         self.time_zone = pytz.timezone(zone)
@@ -80,4 +82,63 @@ def cosine_similarity(
     v_norm = np.linalg.norm(v)
     u_norm = np.linalg.norm(u)
     similarity = v @ u / (v_norm * u_norm)
-    return similarity
+    return similarity
+
+
+def axes_labels_format(
+    left: str,
+    right: str,
+    sep: str,
+    word_wrap: int = 4
+) -> str:
+
+    def sparse(
+        word: str,
+        max_len: int
+    ) -> str:
+
+        diff = max_len-len(word)
+        rest = diff if diff > 0 else 0
+        return word+" "*rest
+
+    def gen_block(
+        list_: List[str],
+        n_rows:int,
+        n_cols:int
+    ) -> List[str]:
+
+        block = []
+        block_row = []
+        for r in range(n_rows):
+            for c in range(n_cols):
+                i = r * n_cols + c
+                w = list_[i] if i <= len(list_) - 1 else ""
+                block_row.append(w)
+                if (i+1) % n_cols == 0:
+                    block.append(block_row)
+                    block_row = []
+        return block
+
+    # Transform 'string' to list of string
+    l_list = [word.strip() for word in left.split(",") if word.strip() != ""]
+    r_list = [word.strip() for word in right.split(",") if word.strip() != ""]
+
+    # Get longest word, and longest list
+    longest_list = max(len(l_list), len(r_list))
+    longest_word = len(max( max(l_list, key=len), max(r_list, key=len)))
+
+    # Creation of word blocks for each list
+    n_rows = (longest_list // word_wrap) if longest_list % word_wrap == 0 else (longest_list // word_wrap) + 1
+    n_cols = word_wrap
+
+    l_block = gen_block(l_list, n_rows, n_cols)
+    r_block = gen_block(r_list, n_rows, n_cols)
+
+    # Transform list of list to sparse string
+    labels = ""
+    for i,(l,r) in enumerate(zip(l_block, r_block)):
+        line = ' '.join([sparse(w, longest_word) for w in l]) + sep + \
+               ' '.join([sparse(w, longest_word) for w in r])
+        labels += f"← {line} →\n" if i==0 else f"  {line}  \n"
+
+    return labels
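A quick worked example of the new helper with illustrative inputs. With four words per side and word_wrap=3, gen_block yields two rows of three columns, padding the short row with empty strings; only the first row is framed by the arrows:

from modules.utils import axes_labels_format

label = axes_labels_format(
    left="alto, bajo, gordo, flaco",
    right="lindo, feo, joven, viejo",
    sep=" " * 20,
    word_wrap=3,
)
print(label)
# Roughly:
# ← alto  bajo  gordo                     lindo feo   joven →
#   flaco                                 viejo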