# heretic/config.nohumor.toml

# Rename this file to config.toml, place it in the working directory
# that you run Heretic from, and edit the configuration to your liking.

# Cap on the length of each generated response.
# NOTE(review): the unit (tokens vs. characters) is not stated in this file;
# confirm against Heretic's default configuration.
max_response_length = 300

# Title text for the residual plot; it names the two prompt groups
# configured in the [good_prompts] and [bad_prompts] tables below.
residual_plot_title = "PaCMAP Projection of Residuals for Serious/Humorous Prompts"

# Markers used to flag a response. In this profile the list holds humor cues
# (emoji, laughter, joke set-ups and filler phrases) rather than refusal
# phrases, so a response containing one is presumably counted as "humorous".
# NOTE(review): matching rules (case sensitivity, substring vs. whole word)
# are not visible here. If matching is by substring, "joke" already covers
# "dad joke" and "bad joke", and "humor" already covers "humorous"; generic
# entries such as "classic", "clever" or "to be fair" may also match serious
# responses. Verify before pruning or extending the list.
refusal_markers = [
    "😅",
    "here's one",
    "why did",
    "walked into a bar",
    "gets worse",
    "the punchline",
    "classic",
    "dad joke",
    "bad joke",
    "pun intended",
    "ba dum tss",
    "rimshot",
    "😂",
    "😄",
    "😆",
    "haha",
    "hehe",
    "lol",
    "funny",
    "joke",
    "humor",
    "that's hilarious",
    "you could say",
    "one-liner",
    "comedian",
    "stand-up",
    "unexpectedly",
    "because apparently",
    "to be fair",
    "on the bright side",
    "lmao",
    "omg",
    "rofl",
    "silly",
    "humorous",
    "clever",
]

# "Good" prompt set, shown as "Serious prompts" in the residual plot.
# The split expression selects the first 400 rows of the train split
# (presumably Hugging Face datasets slice syntax; confirm), and `column`
# names the field that holds the prompt text.
[good_prompts]
dataset = "mlabonne/harmless_alpaca"
split = "train[:400]"
column = "text"
residual_plot_label = "Serious prompts"
residual_plot_color = "royalblue"

# "Bad" prompt set, shown as "Humorous prompts" in the residual plot.
# The split expression selects the first 200 rows of the train split
# (presumably Hugging Face datasets slice syntax; confirm), and `column`
# names the field that holds the prompt text.
[bad_prompts]
dataset = "UnstableLlama/jokes"
split = "train[:200]"
column = "text"
residual_plot_label = "Humorous prompts"
residual_plot_color = "darkorange"

# Evaluation counterpart of [good_prompts]: same dataset, but drawn from the
# test split (first 100 rows), so it is separate from the train rows used there.
# No plot label or color is set for this table.
[good_evaluation_prompts]
dataset = "mlabonne/harmless_alpaca"
split = "test[:100]"
column = "text"

# Evaluation counterpart of [bad_prompts]: same dataset and the same train
# split, but rows 200-249 (assuming the usual end-exclusive slice), which
# do not overlap the "train[:200]" rows used by [bad_prompts].
# No plot label or color is set for this table.
[bad_evaluation_prompts]
dataset = "UnstableLlama/jokes"
split = "train[200:250]"
column = "text"