Module skm_pyutils.stats
Statistics functions and paper reporting.
Expand source code
"""Statistics functions and paper reporting."""
import matplotlib.pyplot as plt
import numpy as np
import pingouin
import seaborn as sns
from skm_pyutils.plot import UnicodeGrabber
from skm_pyutils.table import list_to_df
def plot_dists(x, y, ax=None, **fmt_kwargs):
"""Plot distrubtions of two arrays."""
palette = fmt_kwargs.get("palette", "dark")
context = fmt_kwargs.get("context", "paper")
group1_name = fmt_kwargs.get("group1", "1")
group2_name = fmt_kwargs.get("group2", "2")
vname = fmt_kwargs.get("value", "values")
sns.set_palette(palette)
if context == "paper":
sns.set_context(
"paper", font_scale=1.4, rc={"lines.linewidth": 3.2},
)
else:
sns.set_context(context)
if ax is None:
figure, ax = plt.subplots()
else:
figure = None
despine = fmt_kwargs.get("despine", True)
trim = fmt_kwargs.get("trim", True)
offset = fmt_kwargs.get("offset", None)
df_list = []
for val in x:
df_list.append([val, group1_name])
for val in y:
df_list.append([val, group2_name])
df = list_to_df(df_list, headers=[vname, "group"])
sns.kdeplot(data=df, x=vname, hue="group", multiple="stack", ax=ax)
ax.set_xlabel(vname)
if despine:
sns.despine(offset=offset, trim=trim)
return figure
def plot_corr(x, y, ax=None, **fmt_kwargs):
"""Plot correlation between two arrays."""
value1_name = fmt_kwargs.get("value1", "1")
value2_name = fmt_kwargs.get("value2", "2")
palette = fmt_kwargs.get("palette", "dark")
context = fmt_kwargs.get("context", "paper")
sns.set_palette(palette)
if context == "paper":
sns.set_context(
"paper", font_scale=1.4, rc={"lines.linewidth": 3.2},
)
else:
sns.set_context(context)
if ax is None:
figure, ax = plt.subplots()
else:
figure = None
despine = fmt_kwargs.get("despine", True)
trim = fmt_kwargs.get("trim", True)
offset = fmt_kwargs.get("offset", None)
sns.regplot(x=x, y=y, ax=ax, truncate=False, order=1)
extents_x = (0.97 * np.min(x), np.max(x) * 1.03)
extents_y = (0.97 * np.min(y), np.max(y) * 1.03)
ax.set_xlim(extents_x)
ax.set_ylim(extents_y)
ax.set_xlabel(value1_name)
ax.set_ylabel(value2_name)
if despine:
sns.despine(offset=offset, trim=trim)
return figure
def corr(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs):
"""
Compute correlation between x and y.
Based on the method passed as a keyword arg. By default Pearsons.
Also returns a formatted string representation.
Parameters
----------
x : array_like
First set of observations.
y: array_like
Second set of observations.
fmt_kwargs : dict, optional
A dictionary of kwargs to control the formatting.
value - the name of the values being tested
unit - the unit of the values being tested
group1 - the name of x
group2 - the name of y
signif - the significance level (float)
n_decimals - the number of decimal places to print (int)
n_pdecimals - the number of decimal places to print for p (int)
show_quartiles - include quartiles in report (bool)
do_print - print the report string (bool)
do_plot : bool, optional
Whether or not to plot the result.
ax : axes object, optional
An axes to plot into.
**kwargs : keyword arguments
These are passed to pingouin.corr
Of particular is method - the correlation to run.
Returns
-------
dict with keys:
"results" : pd.DataFrame
The dataframe of results
"output" : str
A string to describe the test result for reporting
"figure" : matplotlib.pyplot.figure or None
The returned figure plotted into.
See also
--------
pinouin.corr
"""
if fmt_kwargs is None:
fmt_kwargs = {}
group = fmt_kwargs.get("group", "")
if group != "":
group = " " + group
unit_name = fmt_kwargs.get("unit", "")
if unit_name != "":
unit_name = " " + unit_name + ", "
else:
unit_name = ", "
value1_name = fmt_kwargs.get("value1", "1")
value2_name = fmt_kwargs.get("value2", "2")
signif_level = fmt_kwargs.get("signif", 0.05)
n_decimals = fmt_kwargs.get("n_decimals", 2)
n_pdecimals = fmt_kwargs.get("n_pdecimals", 3)
show_quartiles = fmt_kwargs.get("show_quartiles", True)
do_print = fmt_kwargs.get("do_print", True)
sided = kwargs.get("alternative", "two-sided")
method = kwargs.get("method", "pearson")
results_df = pingouin.corr(x, y, **kwargs)
n = results_df["n"].values[0]
r = np.round(results_df["r"].values[0], n_decimals)
P = np.round(results_df["p-val"].values[0], n_pdecimals)
power = np.round(results_df["power"].values[0], n_decimals)
ci = np.round(np.array(results_df["CI95%"].values[0]), n_decimals)
lin_reg_df = pingouin.linear_regression(x, y, as_dataframe=False)
lm_adjr2 = np.round(lin_reg_df["adj_r2"], n_decimals)
if method == "pearson":
co_eff_name = "r"
corr_name = "Pearson"
elif method == "spearman":
co_eff_name = "\u03A1"
corr_name = "spearman"
elif method == "kendall":
co_eff_name = "\u03A4"
corr_name = "Kendall"
if r < 0:
type_corr = "negative"
if r == 0:
type_corr = "no"
if r > 0:
type_corr = "positive"
if sided == "two-sided":
tailed = "(two-tailed)"
else:
tailed = "(one-tailed)"
if P < signif_level:
differ_str = "was significant"
else:
differ_str = "was not significant"
if show_quartiles:
result_str = f"There was a {type_corr} {corr_name} correlation of {co_eff_name} = {r} [{ci[0]}, {ci[1]}] 95% CI"
else:
result_str = (
f"There was a {type_corr} {corr_name} correlation of {co_eff_name} = {r}"
)
relation_str = f" between {value1_name} and {value2_name}{group}"
stats_str = f"; this {differ_str} (\u0070 = {P}, N = {n}, power = {power} {tailed}"
pow2 = UnicodeGrabber.get("pow2")
lin_reg_str = f", linear regression R{pow2} = {lm_adjr2})"
final_str = result_str + relation_str + stats_str + lin_reg_str
if do_print:
print(final_str)
figure = None
if do_plot:
figure = plot_corr(x, y, ax=ax, **fmt_kwargs)
results = {
"results": results_df,
"output": final_str,
"figure": figure,
}
return results
def mwu(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs):
"""
Compute the Mann-Whitney U Test.
This is the independent t-test non-parametric version.
Also returns a formatted string for paper reporting.
Parameters
----------
x : array_like
First set of observations.
y: array_like
Second set of observations.
fmt_kwargs : dict, optional
A dictionary of kwargs to control the formatting.
value - the name of the values being tested
unit - the unit of the values being tested
group1 - the name of x
group2 - the name of y
signif - the significance level (float)
n_decimals - the number of decimal places to print (int)
n_pdecimals - the number of decimal places to print for p (int)
show_quartiles - include quartiles in report (bool)
do_print - print the report string (bool)
do_plot : bool, optional
Whether or not to plot the result.
ax : axes object, optional
An axes to plot into.
**kwargs : keyword arguments
These are passed to pingouin.corr
Returns
-------
dict with keys:
"results" : pd.DataFrame
The dataframe of results
"output" : str
A string to describe the test result for reporting
"figure" : matplotlib.pyplot.figure or None
The returned figure plotted into.
See also
--------
pinouin.mwu
scipy.stats.mannwhitneyu
"""
if fmt_kwargs is None:
fmt_kwargs = {}
vname = fmt_kwargs.get("value", "values")
unit_name = fmt_kwargs.get("unit", "")
if unit_name != "":
unit_name = " " + unit_name + ", "
else:
unit_name = ", "
group1_name = fmt_kwargs.get("group1", "1")
group2_name = fmt_kwargs.get("group2", "2")
signif_level = fmt_kwargs.get("signif", 0.05)
n_decimals = fmt_kwargs.get("n_decimals", 2)
n_pdecimals = fmt_kwargs.get("n_pdecimals", 3)
show_quartiles = fmt_kwargs.get("show_quartiles", True)
do_print = fmt_kwargs.get("do_print", True)
sided = kwargs.get("alternative", "two-sided")
results_df = pingouin.mwu(x, y, **kwargs)
U = results_df["U-val"].values[0]
P = np.round(results_df["p-val"].values[0], n_pdecimals)
cl = np.round(results_df["CLES"].values[0], n_decimals)
median1 = np.round(np.nanmedian(x), n_decimals)
lowerq1, higherq1 = np.round(np.nanpercentile(x, [25, 75]), n_decimals)
lowerq2, higherq2 = np.round(np.nanpercentile(y, [25, 75]), n_decimals)
median2 = np.round(np.nanmedian(y), n_decimals)
sample_size1 = len(x)
sample_size2 = len(y)
n1 = "n" + UnicodeGrabber.to_sub(1)
n2 = "n" + UnicodeGrabber.to_sub(2)
if sample_size1 == sample_size2:
sample_str = f"{n1} = {n2} = {sample_size1}"
else:
sample_str = f"{n1} = {sample_size1}, {n2} = {sample_size2}"
stats_str = (
"(Mann-Whitney "
+ "\u0055"
+ f" = {U}, CLES = {cl}, {sample_str}, "
+ "\u0070"
+ f" = {P}"
)
if sided == "two-sided":
stats_str += " two-tailed)."
else:
stats_str += " one-tailed)."
if P < signif_level:
differ_str = "differed significantly"
else:
differ_str = "did not differ significantly"
if show_quartiles:
results_str = (
f"Median [quartiles] {vname} in groups {group1_name} and {group2_name} were "
+ f"{median1} [{lowerq1}, {higherq1}] and {median2} [{lowerq2}, {higherq2}]"
f"{unit_name}respectively; "
f"the distributions in the two groups {differ_str} {stats_str}"
)
else:
results_str = (
f"Median {vname} in groups {group1_name} and {group2_name} were "
+ f"{median1} and {median2}"
f"{unit_name}respectively; "
f"the distributions in the two groups {differ_str} {stats_str}"
)
if do_print:
print(results_str)
figure = None
if do_plot:
figure = plot_dists(x, y, ax=ax, **fmt_kwargs)
results = {
"results": results_df,
"output": results_str,
"figure": figure,
}
return results
def wilcoxon(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs):
"""
Compute the wilcoxon signed-rank test.
This is the non-parametric paired t-test.
Also returns a formatted string for paper reporting.
Parameters
----------
x : array_like
First set of observations.
y: array_like
Second set of observations.
fmt_kwargs : dict, optional
A dictionary of kwargs to control the formatting.
value - the name of the values being tested
unit - the unit of the values being tested
group1 - the name of x
group2 - the name of y
signif - the significance level (float)
n_decimals - the number of decimal places to print (int)
n_pdecimals - the number of decimal places to print for p (int)
show_quartiles - include quartiles in report (bool)
do_print - print the report string (bool)
do_plot : bool, optional
Whether or not to plot the result.
ax : axes object, optional
An axes to plot into.
**kwargs : keyword arguments
These are passed to pingouin.corr
Returns
-------
dict with keys:
"results" : pd.DataFrame
The dataframe of results
"output" : str
A string to describe the test result for reporting
"figure" : matplotlib.pyplot.figure or None
The returned figure plotted into.
See also
--------
pinouin.wilcoxon
scipy.stats.wilcoxon
"""
if fmt_kwargs is None:
fmt_kwargs = {}
vname = fmt_kwargs.get("value", "values")
unit_name = fmt_kwargs.get("unit", "")
if unit_name != "":
unit_name = " " + unit_name + ", "
else:
unit_name = ", "
group1_name = fmt_kwargs.get("group1", "1")
group2_name = fmt_kwargs.get("group2", "2")
signif_level = fmt_kwargs.get("signif", 0.05)
n_decimals = fmt_kwargs.get("n_decimals", 2)
n_pdecimals = fmt_kwargs.get("n_pdecimals", 3)
show_quartiles = fmt_kwargs.get("show_quartiles", True)
do_print = fmt_kwargs.get("do_print", True)
sided = kwargs.get("alternative", "two-sided")
results_df = pingouin.wilcoxon(x, y, **kwargs)
U = results_df["W-val"].values[0]
P = np.round(results_df["p-val"].values[0], n_pdecimals)
cl = np.round(results_df["CLES"].values[0], n_decimals)
median1 = np.round(np.nanmedian(x), n_decimals)
lowerq1, higherq1 = np.round(np.nanpercentile(x, [25, 75]), n_decimals)
lowerq2, higherq2 = np.round(np.nanpercentile(y, [25, 75]), n_decimals)
median2 = np.round(np.nanmedian(y), n_decimals)
sample_size1 = len(x)
sample_size2 = len(y)
n1 = "n" + UnicodeGrabber.to_sub(1)
n2 = "n" + UnicodeGrabber.to_sub(2)
if sample_size1 == sample_size2:
sample_str = f"{n1} = {n2} = {sample_size1}"
else:
sample_str = f"{n1} = {sample_size1}, {n2} = {sample_size2}"
stats_str = (
"(Wilcoxon signed-rank "
+ "W"
+ f" = {U}, CLES = {cl}, {sample_str}, "
+ "\u0070"
+ f" = {P}"
)
if sided == "two-sided":
stats_str += " two-tailed)."
else:
stats_str += " one-tailed)."
if P < signif_level:
differ_str = "differed significantly"
else:
differ_str = "did not differ significantly"
if show_quartiles:
results_str = (
f"Median [quartiles] {vname} in groups {group1_name} and {group2_name} were "
+ f"{median1} [{lowerq1}, {higherq1}] and {median2} [{lowerq2}, {higherq2}]"
f"{unit_name}respectively; "
f"the distributions in the two groups {differ_str} {stats_str}"
)
else:
results_str = (
f"Median {vname} in groups {group1_name} and {group2_name} were "
+ f"{median1} and {median2}"
f"{unit_name}respectively; "
f"the distributions in the two groups {differ_str} {stats_str}"
)
if do_print:
print(results_str)
figure = None
if do_plot:
figure = plot_dists(x, y, ax=ax, **fmt_kwargs)
results = {
"results": results_df,
"output": results_str,
"figure": figure,
}
return results
Functions
def corr(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs)
-
Compute correlation between x and y.
Based on the method passed as a keyword arg. By default Pearsons. Also returns a formatted string representation.
Parameters
x
:array_like
- First set of observations.
y
:array_like
- Second set of observations.
fmt_kwargs
:dict
, optional- A dictionary of kwargs to control the formatting. value - the name of the values being tested unit - the unit of the values being tested group1 - the name of x group2 - the name of y signif - the significance level (float) n_decimals - the number of decimal places to print (int) n_pdecimals - the number of decimal places to print for p (int) show_quartiles - include quartiles in report (bool) do_print - print the report string (bool)
do_plot
:bool
, optional- Whether or not to plot the result.
ax
:axes object
, optional- An axes to plot into.
**kwargs
:keyword arguments
- These are passed to pingouin.corr Of particular is method - the correlation to run.
Returns
dict with keys:
- "results" : pd.DataFrame The dataframe of results "output" : str A string to describe the test result for reporting "figure" : matplotlib.pyplot.figure or None The returned figure plotted into.
See Also
pinouin.corr
Expand source code
def corr(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs): """ Compute correlation between x and y. Based on the method passed as a keyword arg. By default Pearsons. Also returns a formatted string representation. Parameters ---------- x : array_like First set of observations. y: array_like Second set of observations. fmt_kwargs : dict, optional A dictionary of kwargs to control the formatting. value - the name of the values being tested unit - the unit of the values being tested group1 - the name of x group2 - the name of y signif - the significance level (float) n_decimals - the number of decimal places to print (int) n_pdecimals - the number of decimal places to print for p (int) show_quartiles - include quartiles in report (bool) do_print - print the report string (bool) do_plot : bool, optional Whether or not to plot the result. ax : axes object, optional An axes to plot into. **kwargs : keyword arguments These are passed to pingouin.corr Of particular is method - the correlation to run. Returns ------- dict with keys: "results" : pd.DataFrame The dataframe of results "output" : str A string to describe the test result for reporting "figure" : matplotlib.pyplot.figure or None The returned figure plotted into. See also -------- pinouin.corr """ if fmt_kwargs is None: fmt_kwargs = {} group = fmt_kwargs.get("group", "") if group != "": group = " " + group unit_name = fmt_kwargs.get("unit", "") if unit_name != "": unit_name = " " + unit_name + ", " else: unit_name = ", " value1_name = fmt_kwargs.get("value1", "1") value2_name = fmt_kwargs.get("value2", "2") signif_level = fmt_kwargs.get("signif", 0.05) n_decimals = fmt_kwargs.get("n_decimals", 2) n_pdecimals = fmt_kwargs.get("n_pdecimals", 3) show_quartiles = fmt_kwargs.get("show_quartiles", True) do_print = fmt_kwargs.get("do_print", True) sided = kwargs.get("alternative", "two-sided") method = kwargs.get("method", "pearson") results_df = pingouin.corr(x, y, **kwargs) n = results_df["n"].values[0] r = np.round(results_df["r"].values[0], n_decimals) P = np.round(results_df["p-val"].values[0], n_pdecimals) power = np.round(results_df["power"].values[0], n_decimals) ci = np.round(np.array(results_df["CI95%"].values[0]), n_decimals) lin_reg_df = pingouin.linear_regression(x, y, as_dataframe=False) lm_adjr2 = np.round(lin_reg_df["adj_r2"], n_decimals) if method == "pearson": co_eff_name = "r" corr_name = "Pearson" elif method == "spearman": co_eff_name = "\u03A1" corr_name = "spearman" elif method == "kendall": co_eff_name = "\u03A4" corr_name = "Kendall" if r < 0: type_corr = "negative" if r == 0: type_corr = "no" if r > 0: type_corr = "positive" if sided == "two-sided": tailed = "(two-tailed)" else: tailed = "(one-tailed)" if P < signif_level: differ_str = "was significant" else: differ_str = "was not significant" if show_quartiles: result_str = f"There was a {type_corr} {corr_name} correlation of {co_eff_name} = {r} [{ci[0]}, {ci[1]}] 95% CI" else: result_str = ( f"There was a {type_corr} {corr_name} correlation of {co_eff_name} = {r}" ) relation_str = f" between {value1_name} and {value2_name}{group}" stats_str = f"; this {differ_str} (\u0070 = {P}, N = {n}, power = {power} {tailed}" pow2 = UnicodeGrabber.get("pow2") lin_reg_str = f", linear regression R{pow2} = {lm_adjr2})" final_str = result_str + relation_str + stats_str + lin_reg_str if do_print: print(final_str) figure = None if do_plot: figure = plot_corr(x, y, ax=ax, **fmt_kwargs) results = { "results": results_df, "output": final_str, "figure": figure, } return results
def mwu(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs)
-
Compute the Mann-Whitney U Test.
This is the independent t-test non-parametric version. Also returns a formatted string for paper reporting.
Parameters
x
:array_like
- First set of observations.
y
:array_like
- Second set of observations.
fmt_kwargs
:dict
, optional- A dictionary of kwargs to control the formatting. value - the name of the values being tested unit - the unit of the values being tested group1 - the name of x group2 - the name of y signif - the significance level (float) n_decimals - the number of decimal places to print (int) n_pdecimals - the number of decimal places to print for p (int) show_quartiles - include quartiles in report (bool) do_print - print the report string (bool)
do_plot
:bool
, optional- Whether or not to plot the result.
ax
:axes object
, optional- An axes to plot into.
**kwargs
:keyword arguments
- These are passed to pingouin.corr
Returns
dict with keys:
- "results" : pd.DataFrame The dataframe of results "output" : str A string to describe the test result for reporting "figure" : matplotlib.pyplot.figure or None The returned figure plotted into.
See Also
pinouin.mwu
scipy.stats.mannwhitneyu
Expand source code
def mwu(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs): """ Compute the Mann-Whitney U Test. This is the independent t-test non-parametric version. Also returns a formatted string for paper reporting. Parameters ---------- x : array_like First set of observations. y: array_like Second set of observations. fmt_kwargs : dict, optional A dictionary of kwargs to control the formatting. value - the name of the values being tested unit - the unit of the values being tested group1 - the name of x group2 - the name of y signif - the significance level (float) n_decimals - the number of decimal places to print (int) n_pdecimals - the number of decimal places to print for p (int) show_quartiles - include quartiles in report (bool) do_print - print the report string (bool) do_plot : bool, optional Whether or not to plot the result. ax : axes object, optional An axes to plot into. **kwargs : keyword arguments These are passed to pingouin.corr Returns ------- dict with keys: "results" : pd.DataFrame The dataframe of results "output" : str A string to describe the test result for reporting "figure" : matplotlib.pyplot.figure or None The returned figure plotted into. See also -------- pinouin.mwu scipy.stats.mannwhitneyu """ if fmt_kwargs is None: fmt_kwargs = {} vname = fmt_kwargs.get("value", "values") unit_name = fmt_kwargs.get("unit", "") if unit_name != "": unit_name = " " + unit_name + ", " else: unit_name = ", " group1_name = fmt_kwargs.get("group1", "1") group2_name = fmt_kwargs.get("group2", "2") signif_level = fmt_kwargs.get("signif", 0.05) n_decimals = fmt_kwargs.get("n_decimals", 2) n_pdecimals = fmt_kwargs.get("n_pdecimals", 3) show_quartiles = fmt_kwargs.get("show_quartiles", True) do_print = fmt_kwargs.get("do_print", True) sided = kwargs.get("alternative", "two-sided") results_df = pingouin.mwu(x, y, **kwargs) U = results_df["U-val"].values[0] P = np.round(results_df["p-val"].values[0], n_pdecimals) cl = np.round(results_df["CLES"].values[0], n_decimals) median1 = np.round(np.nanmedian(x), n_decimals) lowerq1, higherq1 = np.round(np.nanpercentile(x, [25, 75]), n_decimals) lowerq2, higherq2 = np.round(np.nanpercentile(y, [25, 75]), n_decimals) median2 = np.round(np.nanmedian(y), n_decimals) sample_size1 = len(x) sample_size2 = len(y) n1 = "n" + UnicodeGrabber.to_sub(1) n2 = "n" + UnicodeGrabber.to_sub(2) if sample_size1 == sample_size2: sample_str = f"{n1} = {n2} = {sample_size1}" else: sample_str = f"{n1} = {sample_size1}, {n2} = {sample_size2}" stats_str = ( "(Mann-Whitney " + "\u0055" + f" = {U}, CLES = {cl}, {sample_str}, " + "\u0070" + f" = {P}" ) if sided == "two-sided": stats_str += " two-tailed)." else: stats_str += " one-tailed)." if P < signif_level: differ_str = "differed significantly" else: differ_str = "did not differ significantly" if show_quartiles: results_str = ( f"Median [quartiles] {vname} in groups {group1_name} and {group2_name} were " + f"{median1} [{lowerq1}, {higherq1}] and {median2} [{lowerq2}, {higherq2}]" f"{unit_name}respectively; " f"the distributions in the two groups {differ_str} {stats_str}" ) else: results_str = ( f"Median {vname} in groups {group1_name} and {group2_name} were " + f"{median1} and {median2}" f"{unit_name}respectively; " f"the distributions in the two groups {differ_str} {stats_str}" ) if do_print: print(results_str) figure = None if do_plot: figure = plot_dists(x, y, ax=ax, **fmt_kwargs) results = { "results": results_df, "output": results_str, "figure": figure, } return results
def plot_corr(x, y, ax=None, **fmt_kwargs)
-
Plot correlation between two arrays.
Expand source code
def plot_corr(x, y, ax=None, **fmt_kwargs): """Plot correlation between two arrays.""" value1_name = fmt_kwargs.get("value1", "1") value2_name = fmt_kwargs.get("value2", "2") palette = fmt_kwargs.get("palette", "dark") context = fmt_kwargs.get("context", "paper") sns.set_palette(palette) if context == "paper": sns.set_context( "paper", font_scale=1.4, rc={"lines.linewidth": 3.2}, ) else: sns.set_context(context) if ax is None: figure, ax = plt.subplots() else: figure = None despine = fmt_kwargs.get("despine", True) trim = fmt_kwargs.get("trim", True) offset = fmt_kwargs.get("offset", None) sns.regplot(x=x, y=y, ax=ax, truncate=False, order=1) extents_x = (0.97 * np.min(x), np.max(x) * 1.03) extents_y = (0.97 * np.min(y), np.max(y) * 1.03) ax.set_xlim(extents_x) ax.set_ylim(extents_y) ax.set_xlabel(value1_name) ax.set_ylabel(value2_name) if despine: sns.despine(offset=offset, trim=trim) return figure
def plot_dists(x, y, ax=None, **fmt_kwargs)
-
Plot distrubtions of two arrays.
Expand source code
def plot_dists(x, y, ax=None, **fmt_kwargs): """Plot distrubtions of two arrays.""" palette = fmt_kwargs.get("palette", "dark") context = fmt_kwargs.get("context", "paper") group1_name = fmt_kwargs.get("group1", "1") group2_name = fmt_kwargs.get("group2", "2") vname = fmt_kwargs.get("value", "values") sns.set_palette(palette) if context == "paper": sns.set_context( "paper", font_scale=1.4, rc={"lines.linewidth": 3.2}, ) else: sns.set_context(context) if ax is None: figure, ax = plt.subplots() else: figure = None despine = fmt_kwargs.get("despine", True) trim = fmt_kwargs.get("trim", True) offset = fmt_kwargs.get("offset", None) df_list = [] for val in x: df_list.append([val, group1_name]) for val in y: df_list.append([val, group2_name]) df = list_to_df(df_list, headers=[vname, "group"]) sns.kdeplot(data=df, x=vname, hue="group", multiple="stack", ax=ax) ax.set_xlabel(vname) if despine: sns.despine(offset=offset, trim=trim) return figure
def wilcoxon(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs)
-
Compute the wilcoxon signed-rank test.
This is the non-parametric paired t-test. Also returns a formatted string for paper reporting.
Parameters
x
:array_like
- First set of observations.
y
:array_like
- Second set of observations.
fmt_kwargs
:dict
, optional- A dictionary of kwargs to control the formatting. value - the name of the values being tested unit - the unit of the values being tested group1 - the name of x group2 - the name of y signif - the significance level (float) n_decimals - the number of decimal places to print (int) n_pdecimals - the number of decimal places to print for p (int) show_quartiles - include quartiles in report (bool) do_print - print the report string (bool)
do_plot
:bool
, optional- Whether or not to plot the result.
ax
:axes object
, optional- An axes to plot into.
**kwargs
:keyword arguments
- These are passed to pingouin.corr
Returns
dict with keys:
- "results" : pd.DataFrame The dataframe of results "output" : str A string to describe the test result for reporting "figure" : matplotlib.pyplot.figure or None The returned figure plotted into.
See Also
pinouin.wilcoxon
scipy.stats.wilcoxon
Expand source code
def wilcoxon(x, y, fmt_kwargs=None, do_plot=False, ax=None, **kwargs): """ Compute the wilcoxon signed-rank test. This is the non-parametric paired t-test. Also returns a formatted string for paper reporting. Parameters ---------- x : array_like First set of observations. y: array_like Second set of observations. fmt_kwargs : dict, optional A dictionary of kwargs to control the formatting. value - the name of the values being tested unit - the unit of the values being tested group1 - the name of x group2 - the name of y signif - the significance level (float) n_decimals - the number of decimal places to print (int) n_pdecimals - the number of decimal places to print for p (int) show_quartiles - include quartiles in report (bool) do_print - print the report string (bool) do_plot : bool, optional Whether or not to plot the result. ax : axes object, optional An axes to plot into. **kwargs : keyword arguments These are passed to pingouin.corr Returns ------- dict with keys: "results" : pd.DataFrame The dataframe of results "output" : str A string to describe the test result for reporting "figure" : matplotlib.pyplot.figure or None The returned figure plotted into. See also -------- pinouin.wilcoxon scipy.stats.wilcoxon """ if fmt_kwargs is None: fmt_kwargs = {} vname = fmt_kwargs.get("value", "values") unit_name = fmt_kwargs.get("unit", "") if unit_name != "": unit_name = " " + unit_name + ", " else: unit_name = ", " group1_name = fmt_kwargs.get("group1", "1") group2_name = fmt_kwargs.get("group2", "2") signif_level = fmt_kwargs.get("signif", 0.05) n_decimals = fmt_kwargs.get("n_decimals", 2) n_pdecimals = fmt_kwargs.get("n_pdecimals", 3) show_quartiles = fmt_kwargs.get("show_quartiles", True) do_print = fmt_kwargs.get("do_print", True) sided = kwargs.get("alternative", "two-sided") results_df = pingouin.wilcoxon(x, y, **kwargs) U = results_df["W-val"].values[0] P = np.round(results_df["p-val"].values[0], n_pdecimals) cl = np.round(results_df["CLES"].values[0], n_decimals) median1 = np.round(np.nanmedian(x), n_decimals) lowerq1, higherq1 = np.round(np.nanpercentile(x, [25, 75]), n_decimals) lowerq2, higherq2 = np.round(np.nanpercentile(y, [25, 75]), n_decimals) median2 = np.round(np.nanmedian(y), n_decimals) sample_size1 = len(x) sample_size2 = len(y) n1 = "n" + UnicodeGrabber.to_sub(1) n2 = "n" + UnicodeGrabber.to_sub(2) if sample_size1 == sample_size2: sample_str = f"{n1} = {n2} = {sample_size1}" else: sample_str = f"{n1} = {sample_size1}, {n2} = {sample_size2}" stats_str = ( "(Wilcoxon signed-rank " + "W" + f" = {U}, CLES = {cl}, {sample_str}, " + "\u0070" + f" = {P}" ) if sided == "two-sided": stats_str += " two-tailed)." else: stats_str += " one-tailed)." if P < signif_level: differ_str = "differed significantly" else: differ_str = "did not differ significantly" if show_quartiles: results_str = ( f"Median [quartiles] {vname} in groups {group1_name} and {group2_name} were " + f"{median1} [{lowerq1}, {higherq1}] and {median2} [{lowerq2}, {higherq2}]" f"{unit_name}respectively; " f"the distributions in the two groups {differ_str} {stats_str}" ) else: results_str = ( f"Median {vname} in groups {group1_name} and {group2_name} were " + f"{median1} and {median2}" f"{unit_name}respectively; " f"the distributions in the two groups {differ_str} {stats_str}" ) if do_print: print(results_str) figure = None if do_plot: figure = plot_dists(x, y, ax=ax, **fmt_kwargs) results = { "results": results_df, "output": results_str, "figure": figure, } return results