Spaces:

ludvigolsen
/

plot_confusion_matrix

Running

App Files Files Community

Ludvig committed on Jun 3, 2023

Commit

621a5a4

1 Parent(s): b995dc9

Fixes, styling, improvements

Browse files

Files changed (6) hide show

README.md +5 -12
app.py +109 -92
design.py +8 -16
plot.R +25 -11
text_sections.py +78 -24
utils.py +13 -2

README.md CHANGED Viewed

@@ -3,26 +3,19 @@ title: plot_confusion_matrix
 sdk: docker
 app_file: app.py
 pinned: true
 ---
-# cvms_plot_app
 Streamlit application for plotting a confusion matrix.
-emoji: {{emoji}}
-colorFrom: {{colorFrom}}
-colorTo: {{colorTo}}
 ## TODOs
-- IMPORTANT! Allow specifying which class probabilities are of! (See plot prob_of_class)
-- Allow setting threshold - manual, max J, spec/sens
-- Add bg box around confusion matrix plot as text dissappears on dark mode!
-- ggsave does not use dpi??
 - Allow svg, pdf?
 - Add full reset button (empty cache on different files) - callback?
-- Handle <2 classes in design box (add st.error)
-- Handle classes with spaces in them?
 - Add option to change zero-tile background (e.g. to black for black backgrounds)
 - Add option to format total-count tile in sum tiles

 sdk: docker
 app_file: app.py
 pinned: true
+emoji: 🍀
+colorFrom: fe7120
+colorTo: 8511a5
 ---
+# Plot Confusion Matrix Streamlit Application
 Streamlit application for plotting a confusion matrix.
 ## TODOs
+- ggsave only uses DPI for scaling? We would expect output files to have the given DPI?
 - Allow svg, pdf?
 - Add full reset button (empty cache on different files) - callback?
 - Add option to change zero-tile background (e.g. to black for black backgrounds)
 - Add option to format total-count tile in sum tiles

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from pandas.api.types import is_float_dtype
 from itertools import combinations
 from collections import OrderedDict
-from utils import call_subprocess, clean_string_for_non_alphanumerics
 from data import read_data, read_data_cached, DownloadHeader, generate_data
 from design import design_section
 from text_sections import (
@@ -37,8 +37,6 @@ st.markdown(
 # Create temporary directory
 @st.cache_resource
 def set_tmp_dir():
     """
@@ -157,12 +155,19 @@ elif input_choice == "Upload counts":
             if st.form_submit_button(label="Set columns"):
                 st.session_state["step"] = 2
-                st.session_state["classes"] = sorted(
-                    [
-                        str(c)
-                        for c in st.session_state["count_data"][target_col].unique()
-                    ]
-                )
 # Generate data
 elif input_choice == "Generate":
@@ -283,108 +288,120 @@ elif input_choice == "Enter counts":
         n_col = "N"
 if st.session_state["step"] >= 2:
     if st.session_state["input_type"] == "data":
         # Remove unused columns
         df = df.loc[:, [target_col, prediction_col]]
-        # Ensure targets are strings
-        df[target_col] = df[target_col].astype(str)
-        df[target_col] = df[target_col].apply(lambda x: x.replace(" ", "_"))
-        # Save to tmp directory to allow reading in R script
-        df.to_csv(data_store_path)
-        # Extract unique classes
-        st.session_state["classes"] = sorted([str(c) for c in df[target_col].unique()])
-        predictions_are_probabilities = is_float_dtype(df[prediction_col])
-        if predictions_are_probabilities and len(st.session_state["classes"]) != 2:
-            st.error(
-                "Predictions can only be probabilities in binary classification. "
-                f"Got {len(st.session_state['classes'])} classes."
             )
-        st.subheader("The Data")
-        col1, col2, col3 = st.columns([2, 2, 2])
-        with col2:
-            st.write(df.head(5))
-            st.write(f"{df.shape} (Showing first 5 rows)")
     else:
-        predictions_are_probabilities = False
         st.session_state["count_data"].to_csv(data_store_path)
-    # Check the number of classes
-    num_classes = len(st.session_state["classes"])
-    if num_classes < 2:
-        # TODO Handle better than throwing error?
-        raise ValueError(
-            "Uploaded data must contain 2 or more classes in `Targets column`. "
-            f"Got {num_classes} target classes."
-        )
-    # Section for specifying design settings
-    design_settings, design_ready, selected_classes, prob_of_class = design_section(
-        num_classes=num_classes,
-        predictions_are_probabilities=predictions_are_probabilities,
-        design_settings_store_path=design_settings_store_path,
-    )
-    # design_ready tells us whether to proceed or wait
-    # for user to fix issues
-    if st.session_state["step"] >= 3 and design_ready:
-        DownloadHeader.centered_json_download(
-            data=design_settings,
-            file_name="design_settings.json",
-            label="Download design settings",
-            help="Download the design settings to allow reusing settings in future plots.",
         )
-        st.markdown("---")
-        plotting_args = [
-            "--data_path",
-            f"'{data_store_path}'",
-            "--out_path",
-            f"'{conf_mat_path}'",
-            "--settings_path",
-            f"'{design_settings_store_path}'",
-            "--target_col",
-            f"'{target_col}'",
-            "--prediction_col",
-            f"'{prediction_col}'",
-            "--classes",
-            f"{','.join(selected_classes)}",
-        ]
-        if st.session_state["input_type"] == "counts":
-            # The input data are counts
-            plotting_args += ["--n_col", f"{n_col}", "--data_are_counts"]
-        plotting_args = " ".join(plotting_args)
-        call_subprocess(
-            f"Rscript plot.R {plotting_args}",
-            message="Plotting script",
-            return_output=True,
-            encoding="UTF-8",
-        )
-        DownloadHeader.header_and_image_download(
-            "", filepath=conf_mat_path, label="Download Plot"
-        )
-        col1, col2, col3 = st.columns([2, 8, 2])
-        with col2:
-            image = Image.open(conf_mat_path)
-            st.image(
-                image,
-                caption="Confusion Matrix",
-                clamp=False,
-                channels="RGB",
-                output_format="auto",
             )
 else:
     st.write("Please upload data.")

 from itertools import combinations
 from collections import OrderedDict
+from utils import call_subprocess, clean_string_for_non_alphanumerics, clean_str_column
 from data import read_data, read_data_cached, DownloadHeader, generate_data
 from design import design_section
 from text_sections import (
 # Create temporary directory
 @st.cache_resource
 def set_tmp_dir():
     """
             if st.form_submit_button(label="Set columns"):
                 st.session_state["step"] = 2
+        if st.session_state["step"] >= 2:
+            print(st.session_state["count_data"])
+            # Ensure targets and predictions are clean strings
+            st.session_state["count_data"][target_col] = clean_str_column(
+                st.session_state["count_data"][target_col]
+            )
+            st.session_state["count_data"][prediction_col] = clean_str_column(
+                st.session_state["count_data"][prediction_col]
+            )
+            st.session_state["classes"] = sorted(
+                [c for c in st.session_state["count_data"][target_col].unique()]
+            )
 # Generate data
 elif input_choice == "Generate":
         n_col = "N"
 if st.session_state["step"] >= 2:
+    data_is_ready = False
     if st.session_state["input_type"] == "data":
         # Remove unused columns
         df = df.loc[:, [target_col, prediction_col]]
+        predictions_are_probabilities = is_float_dtype(df[prediction_col])
+        if predictions_are_probabilities:
+            st.error(
+                "Predictions should be the predicted classes - not probabilities. "
+            )
+            data_is_ready = False
+        else:
+            data_is_ready = True
+        if data_is_ready:
+            # Ensure targets and predictions are clean strings
+            df[target_col] = clean_str_column(df[target_col])
+            df[prediction_col] = clean_str_column(df[prediction_col])
+            # Save to tmp directory to allow reading in R script
+            df.to_csv(data_store_path)
+            # Extract unique classes
+            st.session_state["classes"] = sorted(
+                [str(c) for c in df[target_col].unique()]
             )
+            st.subheader("The Data")
+            col1, col2, col3 = st.columns([2, 2, 2])
+            with col2:
+                st.write(df.head(5))
+                st.write(f"{df.shape} (Showing first 5 rows)")
     else:
         st.session_state["count_data"].to_csv(data_store_path)
+        data_is_ready = True
+    if data_is_ready:
+        # Check the number of classes
+        num_classes = len(st.session_state["classes"])
+        if num_classes < 2:
+            # TODO Handle better than throwing error?
+            raise ValueError(
+                "Uploaded data must contain 2 or more classes in `Targets column`. "
+                f"Got {num_classes} target classes."
+            )
+        # Section for specifying design settings
+        design_settings, design_ready, selected_classes = design_section(
+            num_classes=num_classes,
+            design_settings_store_path=design_settings_store_path,
         )
+        # design_ready tells us whether to proceed or wait
+        # for user to fix issues
+        if st.session_state["step"] >= 3 and design_ready:
+            DownloadHeader.centered_json_download(
+                data=design_settings,
+                file_name="design_settings.json",
+                label="Download design settings",
+                help="Download the design settings to allow reusing settings in future plots.",
+            )
+            st.markdown("---")
+            selected_classes_string = ",".join([f"'{c}'" for c in selected_classes])
+            plotting_args = [
+                "--data_path",
+                f"'{data_store_path}'",
+                "--out_path",
+                f"'{conf_mat_path}'",
+                "--settings_path",
+                f"'{design_settings_store_path}'",
+                "--target_col",
+                f"'{target_col}'",
+                "--prediction_col",
+                f"'{prediction_col}'",
+                "--classes",
+                f"{selected_classes_string}",
+            ]
+            if st.session_state["input_type"] == "counts":
+                # The input data are counts
+                plotting_args += ["--n_col", f"{n_col}", "--data_are_counts"]
+            plotting_args = " ".join(plotting_args)
+            call_subprocess(
+                f"Rscript plot.R {plotting_args}",
+                message="Plotting script",
+                return_output=True,
+                encoding="UTF-8",
             )
+            DownloadHeader.header_and_image_download(
+                "", filepath=conf_mat_path, label="Download plot"
+            )
+            col1, col2, col3 = st.columns([2, 8, 2])
+            with col2:
+                st.write(" ")
+                image = Image.open(str(conf_mat_path)[:-3] + "jpg")
+                st.image(
+                    image,
+                    caption="Confusion Matrix",
+                    clamp=False,
+                    channels="RGB",
+                    output_format="auto",
+                )
+                st.write(" ")
+                st.write("Note: The downloadable file has a transparent background.")
 else:
     st.write("Please upload data.")

design.py CHANGED Viewed

@@ -31,7 +31,6 @@ def _add_select_box(
 def design_section(
     num_classes,
-    predictions_are_probabilities,
     design_settings_store_path,
 ):
     output = {}
@@ -80,19 +79,7 @@ def design_section(
                 "of another class is excluded.",
             )
         with col2:
-            prob_of_class = None
-            # Not respected, so disabled for now
-            # if (
-            #     st.session_state["input_type"] == "data"
-            #     and predictions_are_probabilities
-            # ):
-            #     prob_of_class = st.selectbox(
-            #         "Probabilities are of (not working)",
-            #         options=st.session_state["classes"],
-            #         index=1,
-            #     )
-            # else:
-            #     prob_of_class = None
         # Color palette
         output["palette"] = _add_select_box(
@@ -124,9 +111,11 @@ def design_section(
             )
         with col3:
             output["dpi"] = st.number_input(
-                "DPI (not working)",
                 value=get_uploaded_setting(key="dpi", default=320, type_=int),
                 step=10,
             )
         st.write(" ")  # Slightly bigger gap between the two sections
@@ -469,8 +458,11 @@ def design_section(
                     "the sum tiles under **Tiles** >> *Sum tile settings*."
                 )
                 design_ready = False
-    return output, design_ready, selected_classes, prob_of_class
 # defaults: dict,

 def design_section(
     num_classes,
     design_settings_store_path,
 ):
     output = {}
                 "of another class is excluded.",
             )
         with col2:
+            pass
         # Color palette
         output["palette"] = _add_select_box(
             )
         with col3:
             output["dpi"] = st.number_input(
+                "DPI (scaling)",
                 value=get_uploaded_setting(key="dpi", default=320, type_=int),
                 step=10,
+                help="While the output file *currently* won't have this DPI, "
+                "the DPI setting affects scaling of elements. ",
             )
         st.write(" ")  # Slightly bigger gap between the two sections
                     "the sum tiles under **Tiles** >> *Sum tile settings*."
                 )
                 design_ready = False
+            if len(selected_classes) < 2:
+                st.error("At least 2 classes must be selected.")
+                design_ready = False
+    return output, design_ready, selected_classes
 # defaults: dict,

plot.R CHANGED Viewed

@@ -42,10 +42,6 @@ option_list <- list(
             "Comma-separated class names. ",
             "Only these classes will be used - in the specified order."
         )
-    ),
-    make_option(c("--prob_of_class"),
-        type = "character",
-        help = "Name of class that probabilities are of."
     )
 )
@@ -104,10 +100,10 @@ if (isTRUE(dev_mode)) {
     print(df)
 }
-if (!target_col %in% colnames(df)){
     stop("Specified `target_col` not a column in the data.")
 }
-if (!prediction_col %in% colnames(df)){
     stop("Specified `target_col` not a column in the data.")
 }
@@ -157,10 +153,6 @@ if (!isTRUE(data_are_counts)) {
         "multinomial"
     )
-    # TODO : use prob_of_class to ensure probabilities
-    #        are interpreted correctly!!
-    # TODO : Set / calculate threshold
-    # Might need to invert them to get it to work!
     evaluation <- tryCatch(
         {
             cvms::evaluate(
@@ -320,7 +312,29 @@ tryCatch(
         )
     },
     error = function(e) {
-        print(paste0("Failed to ggsave plot to: ", opt$out_path))
         print(e)
         stop(e)
     }

             "Comma-separated class names. ",
             "Only these classes will be used - in the specified order."
         )
     )
 )
     print(df)
 }
+if (!target_col %in% colnames(df)) {
     stop("Specified `target_col` not a column in the data.")
 }
# Fail early when the prediction column is missing from the data.
# The message names `prediction_col` so it is distinguishable from the
# `target_col` check above.
if (!prediction_col %in% colnames(df)) {
    stop("Specified `prediction_col` not a column in the data.")
}
         "multinomial"
     )
     evaluation <- tryCatch(
         {
             cvms::evaluate(
         )
     },
     error = function(e) {
+        print(paste0("png: Failed to ggsave plot to: ", opt$out_path))
+        print(e)
+        stop(e)
+    }
+)
+# Create a jpg version as well
+tryCatch(
+    {
+        ggplot2::ggsave(
+            paste0(substr(
+                opt$out_path,
+                start = 1,
+                stop = nchar(opt$out_path) - 3
+            ), "jpg"),
+            width = design_settings$width,
+            height = design_settings$height,
+            dpi = design_settings$dpi,
+            units = "px"
+        )
+    },
+    error = function(e) {
+        print(paste0("jpg: Failed to ggsave plot to: ", opt$out_path))
         print(e)
         stop(e)
     }

text_sections.py CHANGED Viewed

@@ -1,7 +1,27 @@
 import streamlit as st
 from utils import call_subprocess
 @st.cache_resource
 def get_cvms_version():
     return (
@@ -19,6 +39,27 @@ def get_cvms_version():
     )
 def intro_text():
     col1, col2 = st.columns([8, 2])
     with col1:
@@ -41,18 +82,20 @@ def intro_text():
         st.subheader("Have your data ready?")
         st.markdown(  # TODO: Make A,B, etc. icons
             "Upload a csv file with either: \n\n"
-            "A) **Targets** and **predictions**. \n\n"
-            "B) Existing confusion matrix **counts**. \n\n"
-            "--> Specify the columns to use.\n\n"
-            "--> Press **Generate plot**.\n\n"
         )
     with col2:
         st.subheader("No data to upload?")
         st.markdown(
             "No worries! Either: \n\n"
-            "C) **Input** your counts directly! \n\n"
-            "D) **Generate** some data with **very** easy controls! \n\n"
-            "--> Press **Generate plot**.\n\n"
         )
     st.markdown("""---""")
     st.markdown(
@@ -97,28 +140,38 @@ def upload_counts_text():
     st.subheader("Upload your counts")
     st.write(
         "Plot an existing confusion matrix (counts of target-prediction combinations). "
-        "The application expects a `.csv` file with: \n"
-        "1) A `target classes` column. \n\n"
-        "2) A `predicted classes` column. \n\n"
-        "3) A `combination count` column for the "
-        "combination frequency of 1 and 2. \n\n"
-        "Other columns are currently ignored. "
-        "See example of such a .csv file [here] (TODO). "
     )
 def upload_predictions_text():
     st.subheader("Upload your predictions")
-    st.markdown(
-        "The application expects a `.csv` file with:  \n"
-        "1) A `target` column.  \n"
-        "Targets will be converted into strings. \n\n"
-        "2) A `prediction` column.  \n"
-        "Predictions can be probabilities (binary classification only) or class predictions. \n\n"
-        "Other columns are currently ignored.  \n\n"
-        "You will have the option to select the names of these two columns, so don't "
-        "worry too much about the column names in the uploaded data."
-    )
 def columns_text():
@@ -131,6 +184,7 @@ def columns_text():
 def design_text():
     st.subheader("Design your plot")
     st.write("This is where you customize the design of your confusion matrix plot.")
     st.markdown(
         "The *width* and *height* settings are usually necessary to adjust as they "
         "change the relative size of the elements. Try adjusting 100px at a "

 import streamlit as st
+import pandas as pd
 from utils import call_subprocess
def insert_arrow():
    """Return the inline SVG markup (25px wide) of the arrow icon.

    The markup is meant to be embedded in markdown rendered with
    ``unsafe_allow_html=True``.
    """
    arrow_svg = '<svg xmlns="http://www.w3.org/2000/svg" style="width:25px;" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" class="w-6 h-6"><path stroke-linecap="round" stroke-linejoin="round" d="M17.25 8.25L21 12m0 0l-3.75 3.75M21 12H3" /></svg>'
    return arrow_svg
def insert_chart_icon(choice=0):
    """Return the inline SVG markup (25px wide) of one of two icons.

    Args:
        choice: ``0`` (the default) selects the first icon variant;
            any other value selects the second variant.

    Returns:
        The SVG markup as a string, for use in markdown rendered with
        ``unsafe_allow_html=True``.
    """
    first_variant = '<svg xmlns="http://www.w3.org/2000/svg" style="width:25px;" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5"><path fill-rule="evenodd" d="M3 3.5A1.5 1.5 0 014.5 2h6.879a1.5 1.5 0 011.06.44l4.122 4.12A1.5 1.5 0 0117 7.622V16.5a1.5 1.5 0 01-1.5 1.5h-11A1.5 1.5 0 013 16.5v-13zM13.25 9a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0v-4.5a.75.75 0 01.75-.75zm-6.5 4a.75.75 0 01.75.75v.5a.75.75 0 01-1.5 0v-.5a.75.75 0 01.75-.75zm4-1.25a.75.75 0 00-1.5 0v2.5a.75.75 0 001.5 0v-2.5z" clip-rule="evenodd" /></svg>'
    second_variant = '<svg xmlns="http://www.w3.org/2000/svg" style="width:25px;" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5"><path fill-rule="evenodd" d="M4.5 2A1.5 1.5 0 003 3.5v13A1.5 1.5 0 004.5 18h11a1.5 1.5 0 001.5-1.5V7.621a1.5 1.5 0 00-.44-1.06l-4.12-4.122A1.5 1.5 0 0011.378 2H4.5zm2.25 8.5a.75.75 0 000 1.5h6.5a.75.75 0 000-1.5h-6.5zm0 3a.75.75 0 000 1.5h6.5a.75.75 0 000-1.5h-6.5z" clip-rule="evenodd" /></svg>'
    return first_variant if choice == 0 else second_variant
def insert_edit_icon():
    """Return the inline SVG markup (25px wide) of the edit icon.

    The markup is meant to be embedded in markdown rendered with
    ``unsafe_allow_html=True``.
    """
    edit_svg = '<svg xmlns="http://www.w3.org/2000/svg" style="width:25px;" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5"><path d="M5.433 13.917l1.262-3.155A4 4 0 017.58 9.42l6.92-6.918a2.121 2.121 0 013 3l-6.92 6.918c-.383.383-.84.685-1.343.886l-3.154 1.262a.5.5 0 01-.65-.65z" /><path d="M3.5 5.75c0-.69.56-1.25 1.25-1.25H10A.75.75 0 0010 3H4.75A2.75 2.75 0 002 5.75v9.5A2.75 2.75 0 004.75 18h9.5A2.75 2.75 0 0017 15.25V10a.75.75 0 00-1.5 0v5.25c0 .69-.56 1.25-1.25 1.25h-9.5c-.69 0-1.25-.56-1.25-1.25v-9.5z" /></svg>'
    return edit_svg
def insert_generate_icon():
    """Return the inline SVG markup (25px wide) of the generate icon.

    The markup is meant to be embedded in markdown rendered with
    ``unsafe_allow_html=True``.
    """
    generate_svg = '<svg xmlns="http://www.w3.org/2000/svg" style="width:25px;" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5"><path fill-rule="evenodd" d="M10 1a.75.75 0 01.75.75v1.5a.75.75 0 01-1.5 0v-1.5A.75.75 0 0110 1zM5.05 3.05a.75.75 0 011.06 0l1.062 1.06A.75.75 0 116.11 5.173L5.05 4.11a.75.75 0 010-1.06zm9.9 0a.75.75 0 010 1.06l-1.06 1.062a.75.75 0 01-1.062-1.061l1.061-1.06a.75.75 0 011.06 0zM3 8a.75.75 0 01.75-.75h1.5a.75.75 0 010 1.5h-1.5A.75.75 0 013 8zm11 0a.75.75 0 01.75-.75h1.5a.75.75 0 010 1.5h-1.5A.75.75 0 0114 8zm-6.828 2.828a.75.75 0 010 1.061L6.11 12.95a.75.75 0 01-1.06-1.06l1.06-1.06a.75.75 0 011.06 0zm3.594-3.317a.75.75 0 00-1.37.364l-.492 6.861a.75.75 0 001.204.65l1.043-.799.985 3.678a.75.75 0 001.45-.388l-.978-3.646 1.292.204a.75.75 0 00.74-1.16l-3.874-5.764z" clip-rule="evenodd" /></svg>'
    return generate_svg
 @st.cache_resource
 def get_cvms_version():
     return (
     )
@st.cache_data
def get_example_counts():
    """Build the small example table of confusion-matrix counts.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Target``, ``Prediction``
        and ``N``: one row per target/prediction combination of the two
        classes ``cl1`` and ``cl2``.
    """
    example_columns = {
        "Target": ["cl1", "cl2", "cl1", "cl2"],
        "Prediction": ["cl1", "cl2", "cl2", "cl1"],
        "N": [12, 10, 3, 5],
    }
    return pd.DataFrame(example_columns)
@st.cache_data
def get_example_data():
    """Build the small example table of targets and class predictions.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Target`` and
        ``Prediction``, holding six rows over the classes ``cl1`` and
        ``cl2``.
    """
    example_columns = {
        "Target": ["cl1", "cl1", "cl2", "cl2", "cl1", "cl1"],
        "Prediction": ["cl1", "cl2", "cl2", "cl1", "cl1", "cl2"],
    }
    return pd.DataFrame(example_columns)
 def intro_text():
     col1, col2 = st.columns([8, 2])
     with col1:
         st.subheader("Have your data ready?")
         st.markdown(  # TODO: Make A,B, etc. icons
             "Upload a csv file with either: \n\n"
+            f"{insert_chart_icon(1)} **Targets** and **predictions** \n\n"
+            f"{insert_chart_icon(0)} Existing confusion matrix **counts** \n\n"
+            f"{insert_arrow()} Specify the columns to use\n\n"
+            f"{insert_arrow()} Press **Generate plot**\n\n",
+            unsafe_allow_html=True,
         )
     with col2:
         st.subheader("No data to upload?")
         st.markdown(
             "No worries! Either: \n\n"
+            f"{insert_edit_icon()} **Input** your counts directly! \n\n"
+            f"{insert_generate_icon()} **Generate** some data with **very** easy controls! \n\n"
+            f"{insert_arrow()} Press **Generate plot**\n\n",
+            unsafe_allow_html=True,
         )
     st.markdown("""---""")
     st.markdown(
     st.subheader("Upload your counts")
     st.write(
         "Plot an existing confusion matrix (counts of target-prediction combinations). "
     )
+    col1, col2 = st.columns([5, 4])
+    with col1:
+        st.markdown(
+            "The application expects a `.csv` file with: \n"
+            "1) A `target classes` column. \n\n"
+            "2) A `predicted classes` column. \n\n"
+            "3) A `combination count` column for the "
+            "combination frequency of 1 and 2. \n\n"
+            "Other columns are currently ignored. "
+            "In the next step, you will be asked to select the names of these two columns. "
+        )
+    with col2:
+        st.write("Example of such a file:")
+        st.write(get_example_counts())
 def upload_predictions_text():
     """Render the instructions for uploading a targets/predictions file.

     Writes a subheader, then two columns: the expected `.csv` layout on
     the left and an example table (from `get_example_data`) on the right.
     """
     instructions = (
         "The application expects a `.csv` file with:  \n"
         "1) A `target` column.  \n"
         "2) A `prediction` column.  \n"
         "Predictions should be class predictions (not probabilities). \n\n"
         "Other columns are currently ignored.  \n\n"
         "In the next step, you will be asked to select the names of these two columns. "
     )
     st.subheader("Upload your predictions")
     text_col, example_col = st.columns([5, 4])
     with text_col:
         st.markdown(instructions)
     with example_col:
         st.write("Example of such a file:")
         st.write(get_example_data())
 def columns_text():
 def design_text():
     st.subheader("Design your plot")
     st.write("This is where you customize the design of your confusion matrix plot.")
+    st.markdown("We suggest you go directly to `Generate plot` to see the starting point. Then go back and tweak to your liking!")
     st.markdown(
         "The *width* and *height* settings are usually necessary to adjust as they "
         "change the relative size of the elements. Try adjusting 100px at a "

utils.py CHANGED Viewed

@@ -21,5 +21,16 @@ def call_subprocess(call_, message, return_output=False, encoding="UTF-8"):
 def clean_string_for_non_alphanumerics(s):
-    pattern = re.compile("[\W'_']+")
-    return pattern.sub("", s)

 def clean_string_for_non_alphanumerics(s):
     """Strip non-alphanumeric characters and normalise whitespace.

     Every character that is not an ASCII letter, a digit or whitespace is
     removed. Each run of whitespace is then collapsed to a single space and
     leading/trailing whitespace is trimmed.

     Args:
         s: The string to clean.

     Returns:
         The cleaned string (possibly empty).
     """
     # Raw string literals: "\s" in a normal literal is an invalid escape
     # sequence that recent Python versions warn about.
     # Remove non-alphanumerics (keep whitespace)
     s = re.sub(r"[^0-9a-zA-Z\s]+", "", s)
     # Replace each run of whitespace with a single space
     s = re.sub(r"\s+", " ", s)
     # Trim whitespace at start and end
     return s.strip()
def clean_str_column(x):
    """Convert a column to strings and clean every value.

    Args:
        x: Column object supporting ``astype`` and ``apply``
            (callers pass DataFrame columns).

    Returns:
        The column with each value cast to ``str`` and passed through
        `clean_string_for_non_alphanumerics`.
    """
    as_strings = x.astype(str)
    return as_strings.apply(clean_string_for_non_alphanumerics)