From 050503a8c8bb30c3c70e62360b1a57adc5f6e1ba Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Dec 2023 15:33:26 -0800 Subject: [PATCH 01/18] STY: Bump ruff; use ruff's docstring/code formatting --- .pre-commit-config.yaml | 8 +- ci/code_checks.sh | 102 -- environment.yml | 1 - pandas/_config/config.py | 4 +- pandas/_libs/lib.pyi | 6 +- pandas/_testing/_warnings.py | 2 - pandas/_testing/asserters.py | 4 +- pandas/_testing/contexts.py | 3 +- pandas/conftest.py | 5 +- pandas/core/accessor.py | 3 +- pandas/core/algorithms.py | 38 +- pandas/core/apply.py | 35 +- pandas/core/arraylike.py | 30 +- pandas/core/arrays/_arrow_string_mixins.py | 3 +- pandas/core/arrays/_mixins.py | 4 +- pandas/core/arrays/arrow/accessors.py | 45 +- pandas/core/arrays/arrow/array.py | 55 +- pandas/core/arrays/base.py | 96 +- pandas/core/arrays/categorical.py | 164 ++- pandas/core/arrays/datetimelike.py | 48 +- pandas/core/arrays/datetimes.py | 107 +- pandas/core/arrays/integer.py | 4 +- pandas/core/arrays/interval.py | 28 +- pandas/core/arrays/masked.py | 7 +- pandas/core/arrays/period.py | 39 +- pandas/core/arrays/sparse/accessor.py | 20 +- pandas/core/arrays/sparse/array.py | 7 +- pandas/core/arrays/string_.py | 8 +- pandas/core/arrays/string_arrow.py | 5 +- pandas/core/arrays/timedeltas.py | 10 +- pandas/core/base.py | 30 +- pandas/core/computation/eval.py | 4 +- pandas/core/construction.py | 28 +- pandas/core/dtypes/base.py | 8 +- pandas/core/dtypes/cast.py | 12 +- pandas/core/dtypes/common.py | 140 ++- pandas/core/dtypes/concat.py | 8 +- pandas/core/dtypes/dtypes.py | 52 +- pandas/core/dtypes/inference.py | 2 +- pandas/core/dtypes/missing.py | 40 +- pandas/core/flags.py | 4 +- pandas/core/frame.py | 1107 +++++++++++------ pandas/core/generic.py | 1038 ++++++++++------ pandas/core/groupby/generic.py | 267 ++-- pandas/core/groupby/groupby.py | 576 ++++++--- pandas/core/groupby/grouper.py | 39 +- pandas/core/groupby/indexing.py | 6 +- pandas/core/indexers/objects.py | 18 +- pandas/core/indexers/utils.py | 6 +- pandas/core/indexes/accessors.py | 38 +- pandas/core/indexes/base.py | 293 +++-- pandas/core/indexes/category.py | 35 +- pandas/core/indexes/datetimelike.py | 8 +- pandas/core/indexes/datetimes.py | 62 +- pandas/core/indexes/interval.py | 25 +- pandas/core/indexes/multi.py | 209 ++-- pandas/core/indexes/period.py | 15 +- pandas/core/indexes/timedeltas.py | 20 +- pandas/core/indexing.py | 132 +- pandas/core/interchange/from_dataframe.py | 13 +- pandas/core/internals/blocks.py | 3 +- pandas/core/internals/concat.py | 8 +- pandas/core/missing.py | 4 +- pandas/core/ops/missing.py | 4 +- pandas/core/resample.py | 257 ++-- pandas/core/reshape/concat.py | 43 +- pandas/core/reshape/encoding.py | 58 +- pandas/core/reshape/melt.py | 143 ++- pandas/core/reshape/merge.py | 110 +- pandas/core/reshape/pivot.py | 64 +- pandas/core/reshape/reshape.py | 7 +- pandas/core/reshape/tile.py | 49 +- pandas/core/reshape/util.py | 2 +- pandas/core/series.py | 328 +++-- pandas/core/shared_docs.py | 44 +- pandas/core/strings/accessor.py | 262 ++-- pandas/core/tools/datetimes.py | 58 +- pandas/core/tools/numeric.py | 10 +- pandas/core/tools/timedeltas.py | 10 +- pandas/core/window/rolling.py | 45 +- pandas/errors/__init__.py | 233 ++-- pandas/io/clipboards.py | 4 +- pandas/io/common.py | 19 +- pandas/io/excel/_base.py | 70 +- pandas/io/excel/_calamine.py | 3 +- pandas/io/excel/_util.py | 4 +- pandas/io/feather_format.py | 4 +- pandas/io/formats/css.py | 9 +- 
pandas/io/formats/format.py | 8 +- pandas/io/formats/info.py | 4 +- pandas/io/formats/printing.py | 2 +- pandas/io/formats/style.py | 476 ++++--- pandas/io/formats/style_render.py | 56 +- pandas/io/gbq.py | 7 +- pandas/io/html.py | 4 +- pandas/io/json/_normalize.py | 14 +- pandas/io/json/_table_schema.py | 14 +- pandas/io/parquet.py | 12 +- pandas/io/parsers/c_parser_wrapper.py | 2 +- pandas/io/parsers/readers.py | 2 +- pandas/io/pickle.py | 16 +- pandas/io/pytables.py | 96 +- pandas/io/sql.py | 40 +- pandas/io/stata.py | 153 ++- pandas/io/xml.py | 26 +- pandas/plotting/_core.py | 222 ++-- pandas/plotting/_matplotlib/core.py | 5 +- pandas/plotting/_matplotlib/groupby.py | 22 +- pandas/plotting/_matplotlib/hist.py | 8 +- pandas/plotting/_matplotlib/timeseries.py | 4 +- pandas/plotting/_matplotlib/tools.py | 6 +- pandas/plotting/_misc.py | 155 ++- pandas/tests/dtypes/test_inference.py | 4 +- pandas/tests/dtypes/test_missing.py | 6 +- .../frame/methods/test_first_and_last.py | 8 +- pandas/tests/frame/methods/test_rank.py | 4 +- .../tests/frame/methods/test_reset_index.py | 4 +- .../tests/groupby/aggregate/test_aggregate.py | 8 +- .../indexes/datetimes/test_scalar_compat.py | 3 +- pandas/tests/indexes/multi/test_join.py | 2 +- pandas/tests/indexes/multi/test_sorting.py | 4 +- pandas/tests/indexes/numeric/test_indexing.py | 5 +- pandas/tests/indexes/numeric/test_join.py | 12 +- .../tests/indexing/multiindex/test_partial.py | 2 +- .../tests/indexing/multiindex/test_slice.py | 18 +- pandas/tests/indexing/test_loc.py | 4 +- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/io/formats/style/test_html.py | 6 +- pandas/tests/io/formats/test_to_latex.py | 4 +- pandas/tests/io/json/test_ujson.py | 6 +- pandas/tests/io/parser/test_converters.py | 9 +- pandas/tests/io/parser/test_encoding.py | 4 +- pandas/tests/io/parser/test_read_fwf.py | 20 +- pandas/tests/io/parser/test_skiprows.py | 3 +- pandas/tests/io/parser/test_textreader.py | 10 +- .../tests/io/pytables/test_file_handling.py | 2 +- pandas/tests/io/test_html.py | 3 +- .../tests/scalar/timestamp/test_timestamp.py | 3 +- .../series/accessors/test_dt_accessor.py | 3 +- pandas/tests/strings/conftest.py | 2 +- pandas/tests/strings/test_api.py | 10 +- pandas/tests/strings/test_strings.py | 3 +- pandas/tests/test_algos.py | 4 +- .../tseries/offsets/test_business_hour.py | 30 +- .../offsets/test_custom_business_hour.py | 18 +- pandas/tseries/frequencies.py | 4 +- pandas/tseries/holiday.py | 27 +- pandas/util/_decorators.py | 37 +- pyproject.toml | 4 + requirements-dev.txt | 1 - scripts/tests/test_validate_docstrings.py | 65 +- scripts/validate_docstrings.py | 63 - 152 files changed, 5471 insertions(+), 3291 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 782340178f462..fdf100239cacd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,13 +18,8 @@ ci: # manual stage hooks skip: [pylint, pyright, mypy] repos: -- repo: https://github.com/hauntsaninja/black-pre-commit-mirror - # black compiled with mypyc - rev: 23.11.0 - hooks: - - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.6 + rev: v0.1.8 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -33,6 +28,7 @@ repos: name: ruff-selected-autofixes alias: ruff-selected-autofixes args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix] + - id: ruff-format - repo: https://github.com/jendrikseipp/vulture rev: 'v2.10' hooks: diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e41f625e583c0..d8814b83b46db 
100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -63,108 +63,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX02,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06 RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Partially validate docstrings (EX03)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \ - pandas.Series.dt.day_name \ - pandas.Series.str.len \ - pandas.Series.cat.set_categories \ - pandas.Series.plot.bar \ - pandas.Series.plot.hist \ - pandas.Series.plot.line \ - pandas.Series.to_sql \ - pandas.Series.to_latex \ - pandas.errors.CategoricalConversionWarning \ - pandas.errors.ChainedAssignmentError \ - pandas.errors.ClosedFileError \ - pandas.errors.DatabaseError \ - pandas.errors.IndexingError \ - pandas.errors.InvalidColumnName \ - pandas.errors.NumExprClobberingError \ - pandas.errors.PossibleDataLossError \ - pandas.errors.PossiblePrecisionLoss \ - pandas.errors.SettingWithCopyError \ - pandas.errors.SettingWithCopyWarning \ - pandas.errors.SpecificationError \ - pandas.errors.UndefinedVariableError \ - pandas.errors.ValueLabelTypeMismatch \ - pandas.Timestamp.ceil \ - pandas.Timestamp.floor \ - pandas.Timestamp.round \ - pandas.read_pickle \ - pandas.ExcelWriter \ - pandas.read_json \ - pandas.io.json.build_table_schema \ - pandas.DataFrame.to_latex \ - pandas.io.formats.style.Styler.to_latex \ - pandas.read_parquet \ - pandas.DataFrame.to_sql \ - pandas.read_stata \ - pandas.core.resample.Resampler.pipe \ - pandas.core.resample.Resampler.fillna \ - pandas.core.resample.Resampler.interpolate \ - pandas.plotting.scatter_matrix \ - pandas.pivot \ - pandas.merge_asof \ - pandas.wide_to_long \ - pandas.Index.rename \ - pandas.Index.droplevel \ - pandas.Index.isin \ - pandas.CategoricalIndex.set_categories \ - pandas.MultiIndex.names \ - pandas.MultiIndex.droplevel \ - pandas.IndexSlice \ - pandas.DatetimeIndex.month_name \ - pandas.DatetimeIndex.day_name \ - pandas.core.window.rolling.Rolling.corr \ - pandas.Grouper \ - pandas.core.groupby.SeriesGroupBy.apply \ - pandas.core.groupby.DataFrameGroupBy.apply \ - pandas.core.groupby.SeriesGroupBy.transform \ - pandas.core.groupby.SeriesGroupBy.pipe \ - pandas.core.groupby.DataFrameGroupBy.pipe \ - pandas.core.groupby.DataFrameGroupBy.describe \ - pandas.core.groupby.DataFrameGroupBy.idxmax \ - pandas.core.groupby.DataFrameGroupBy.idxmin \ - pandas.core.groupby.DataFrameGroupBy.value_counts \ - pandas.core.groupby.SeriesGroupBy.describe \ - pandas.core.groupby.DataFrameGroupBy.boxplot \ - pandas.core.groupby.DataFrameGroupBy.hist \ - pandas.io.formats.style.Styler.map \ - pandas.io.formats.style.Styler.apply_index \ - pandas.io.formats.style.Styler.map_index \ - pandas.io.formats.style.Styler.format \ - pandas.io.formats.style.Styler.format_index \ - pandas.io.formats.style.Styler.relabel_index \ - pandas.io.formats.style.Styler.hide \ - pandas.io.formats.style.Styler.set_td_classes \ - pandas.io.formats.style.Styler.set_tooltips \ - pandas.io.formats.style.Styler.set_uuid \ - pandas.io.formats.style.Styler.pipe \ - pandas.io.formats.style.Styler.highlight_between \ - pandas.io.formats.style.Styler.highlight_quantile \ - pandas.io.formats.style.Styler.background_gradient \ - pandas.io.formats.style.Styler.text_gradient \ - pandas.DataFrame.values \ - pandas.DataFrame.loc \ - 
pandas.DataFrame.iloc \ - pandas.DataFrame.groupby \ - pandas.DataFrame.describe \ - pandas.DataFrame.skew \ - pandas.DataFrame.var \ - pandas.DataFrame.idxmax \ - pandas.DataFrame.idxmin \ - pandas.DataFrame.last \ - pandas.DataFrame.pivot \ - pandas.DataFrame.sort_values \ - pandas.DataFrame.tz_convert \ - pandas.DataFrame.tz_localize \ - pandas.DataFrame.plot.bar \ - pandas.DataFrame.plot.hexbin \ - pandas.DataFrame.plot.hist \ - pandas.DataFrame.plot.line \ - pandas.DataFrame.hist \ - RET=$(($RET + $?)) ; echo $MSG "DONE" - fi ### DOCUMENTATION NOTEBOOKS ### diff --git a/environment.yml b/environment.yml index 0014e71ed0804..56e23aead58ab 100644 --- a/environment.yml +++ b/environment.yml @@ -75,7 +75,6 @@ dependencies: - cxx-compiler # code checks - - flake8=6.1.0 # run in subprocess over docstring examples - mypy=1.4.1 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - pre-commit>=3.6.0 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index a776825732090..649a0080760ba 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -462,7 +462,9 @@ class option_context(ContextDecorator): Examples -------- >>> from pandas import option_context - >>> with option_context('display.max_rows', 10, 'display.max_columns', 5): + >>> with option_context( + ... "display.max_rows", 10, "display.max_columns", 5 + ... ): ... pass """ diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b9fd970e68f5b..32ecd264262d6 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -179,7 +179,8 @@ def indices_fast( sorted_labels: list[npt.NDArray[np.int64]], ) -> dict[Hashable, npt.NDArray[np.intp]]: ... def generate_slices( - labels: np.ndarray, ngroups: int # const intp_t[:] + labels: np.ndarray, + ngroups: int, # const intp_t[:] ) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... def count_level_2d( mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True], @@ -209,5 +210,6 @@ def get_reverse_indexer( def is_bool_list(obj: list) -> bool: ... def dtypes_all_equal(types: list[DtypeObj]) -> bool: ... def is_range_indexer( - left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1] + left: np.ndarray, + n: int, # np.ndarray[np.int64, ndim=1] ) -> bool: ... diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f11dc11f6ac0d..288845b576f4f 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -75,10 +75,8 @@ class for all warnings. To raise multiple types of exceptions, >>> import warnings >>> with assert_produces_warning(): ... warnings.warn(UserWarning()) - ... >>> with assert_produces_warning(False): ... warnings.warn(RuntimeWarning()) - ... Traceback (most recent call last): ... AssertionError: Caused unexpected warning(s): ['RuntimeWarning']. diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index e342f76dc724b..199324955d7a6 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -1128,8 +1128,8 @@ def assert_frame_equal( but with columns of differing dtypes. >>> from pandas.testing import assert_frame_equal - >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) - >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) + >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + >>> df2 = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}) df1 equals itself. 
diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index eb6e4a917889a..3570ebaeffed5 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -70,9 +70,8 @@ def set_timezone(tz: str) -> Generator[None, None, None]: >>> tzlocal().tzname(datetime(2021, 1, 1)) # doctest: +SKIP 'IST' - >>> with set_timezone('US/Eastern'): + >>> with set_timezone("US/Eastern"): ... tzlocal().tzname(datetime(2021, 1, 1)) - ... 'EST' """ import time diff --git a/pandas/conftest.py b/pandas/conftest.py index 7bfd1b35f5314..e00c3712f930b 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1750,7 +1750,10 @@ def any_skipna_inferred_dtype(request): >>> def test_something(any_skipna_inferred_dtype): ... inferred_dtype, values = any_skipna_inferred_dtype ... # will pass - ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... assert ( + ... lib.infer_dtype(values, skipna=True) + ... == inferred_dtype + ... ) """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 698abb2ec4989..9c06981441aa1 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -265,7 +265,7 @@ def __init__(self, pandas_object): # noqa: E999 For consistency with pandas methods, you should raise an ``AttributeError`` if the data passed to your accessor has an incorrect dtype. - >>> pd.Series(['a', 'b']).dt + >>> pd.Series(["a", "b"]).dt Traceback (most recent call last): ... AttributeError: Can only use .dt accessor with datetimelike values @@ -276,6 +276,7 @@ def __init__(self, pandas_object): # noqa: E999 import pandas as pd + @pd.api.extensions.register_dataframe_accessor("geo") class GeoAccessor: def __init__(self, pandas_obj): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 15a07da76d2f7..63b7cb1661dcf 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -342,7 +342,11 @@ def unique(values): >>> pd.unique(pd.Series([2] + [1] * 5)) array([2, 1]) - >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) + >>> pd.unique( + ... pd.Series( + ... [pd.Timestamp("20160101"), pd.Timestamp("20160101")] + ... ) + ... ) array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') >>> pd.unique( @@ -379,7 +383,13 @@ def unique(values): ['b', 'a', 'c'] Categories (3, object): ['a', 'b', 'c'] - >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc")))) + >>> pd.unique( + ... pd.Series( + ... pd.Categorical( + ... list("baabc"), categories=list("abc") + ... ) + ... ) + ... ) ['b', 'a', 'c'] Categories (3, object): ['a', 'b', 'c'] @@ -387,7 +397,11 @@ def unique(values): >>> pd.unique( ... pd.Series( - ... pd.Categorical(list("baabc"), categories=list("abc"), ordered=True) + ... pd.Categorical( + ... list("baabc"), + ... categories=list("abc"), + ... ordered=True, + ... ) ... ) ... ) ['b', 'a', 'c'] @@ -395,7 +409,11 @@ def unique(values): An array of tuples - >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values) + >>> pd.unique( + ... pd.Series( + ... [("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")] + ... ).values + ... ) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) """ return unique_with_mask(values) @@ -1206,11 +1224,17 @@ def take( Setting ``allow_fill=True`` will place `fill_value` in those positions. - >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True) + >>> pd.api.extensions.take( + ... 
np.array([10, 20, 30]), [0, 0, -1], allow_fill=True
+    ... )
     array([10., 10., nan])

-    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
-    ...                        fill_value=-10)
+    >>> pd.api.extensions.take(
+    ...     np.array([10, 20, 30]),
+    ...     [0, 0, -1],
+    ...     allow_fill=True,
+    ...     fill_value=-10,
+    ... )
     array([ 10,  10, -10])
     """
     if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)):
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 169a44accf066..4a575615aa026 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -1010,7 +1010,8 @@ def wrapper(*args, **kwargs):
             # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str |
             # list[Callable[..., Any] | str]]"; expected "Hashable"
             nb_looper = generate_apply_looper(
-                self.func, **engine_kwargs  # type: ignore[arg-type]
+                self.func,
+                **engine_kwargs,  # type: ignore[arg-type]
             )
             result = nb_looper(self.values, self.axis)
             # If we made the result 2-D, squeeze it back to 1-D
@@ -1718,7 +1719,9 @@ def is_multi_agg_with_relabel(**kwargs) -> bool:
     --------
     >>> is_multi_agg_with_relabel(a="max")
     False
-    >>> is_multi_agg_with_relabel(a_max=("a", "max"), a_min=("a", "min"))
+    >>> is_multi_agg_with_relabel(
+    ...     a_max=("a", "max"), a_min=("a", "min")
+    ... )
     True
     >>> is_multi_agg_with_relabel()
     False
@@ -1788,14 +1791,18 @@ def normalize_keyword_aggregation(


 def _make_unique_kwarg_list(
-    seq: Sequence[tuple[Any, Any]]
+    seq: Sequence[tuple[Any, Any]],
 ) -> Sequence[tuple[Any, Any]]:
     """
     Uniquify aggfunc name of the pairs in the order list

     Examples:
     --------
-    >>> kwarg_list = [('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')]
+    >>> kwarg_list = [
+    ...     ("a", "<lambda>"),
+    ...     ("a", "<lambda>"),
+    ...     ("b", "<lambda>"),
+    ... ]
     >>> _make_unique_kwarg_list(kwarg_list)
     [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
     """
@@ -1826,13 +1833,19 @@ def relabel_result(
     --------
     >>> from pandas.core.apply import relabel_result
     >>> result = pd.DataFrame(
-    ...     {"A": [np.nan, 2, np.nan], "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]},
-    ...     index=["max", "mean", "min"]
+    ...     {
+    ...         "A": [np.nan, 2, np.nan],
+    ...         "C": [6, np.nan, np.nan],
+    ...         "B": [np.nan, 4, 2.5],
+    ...     },
+    ...     index=["max", "mean", "min"],
     ... )
     >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]}
     >>> columns = ("foo", "aab", "bar", "dat")
     >>> order = [0, 1, 2, 3]
-    >>> result_in_dict = relabel_result(result, funcs, columns, order)
+    >>> result_in_dict = relabel_result(
+    ...     result, funcs, columns, order
+    ... )
     >>> pd.DataFrame(result_in_dict, index=columns)
            A    C    B
     foo  2.0  NaN  NaN
@@ -1966,9 +1979,11 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any:

     Examples
     --------
-    >>> maybe_mangle_lambdas('sum')
+    >>> maybe_mangle_lambdas("sum")
     'sum'
-    >>> maybe_mangle_lambdas([lambda: 1, lambda: 2])  # doctest: +SKIP
+    >>> maybe_mangle_lambdas(
+    ...     [lambda: 1, lambda: 2]
+    ... )  # doctest: +SKIP
     [<function __main__.<lambda_0>()>,
      <function pandas...._make_lambda.<locals>.f(*args, **kwargs)>]
     """
@@ -2011,7 +2026,7 @@ def validate_func_kwargs(

     Examples
     --------
-    >>> validate_func_kwargs({'one': 'min', 'two': 'max'})
+    >>> validate_func_kwargs({"one": "min", "two": "max"})
     (['one', 'two'], ['min', 'max'])
     """
     tuple_given_message = "func is expected but received {} in **kwargs."
diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py
index fd585b3e19468..c7e0a1b6204ce 100644
--- a/pandas/core/arraylike.py
+++ b/pandas/core/arraylike.py
@@ -119,8 +119,10 @@ def __add__(self, other):

         Examples
         --------
-        >>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]},
-        ...                   index=['elk', 'moose'])
+        >>> df = pd.DataFrame(
+        ...
{"height": [1.5, 2.6], "weight": [500, 800]}, + ... index=["elk", "moose"], + ... ) >>> df height weight elk 1.5 500 @@ -128,14 +130,14 @@ def __add__(self, other): Adding a scalar affects all rows and columns. - >>> df[['height', 'weight']] + 1.5 + >>> df[["height", "weight"]] + 1.5 height weight elk 3.0 501.5 moose 4.1 801.5 Each element of a list is added to a column of the DataFrame, in order. - >>> df[['height', 'weight']] + [0.5, 1.5] + >>> df[["height", "weight"]] + [0.5, 1.5] height weight elk 2.0 501.5 moose 3.1 801.5 @@ -143,7 +145,7 @@ def __add__(self, other): Keys of a dictionary are aligned to the DataFrame, based on column names; each value in the dictionary is added to the corresponding column. - >>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5} + >>> df[["height", "weight"]] + {"height": 0.5, "weight": 1.5} height weight elk 2.0 501.5 moose 3.1 801.5 @@ -151,8 +153,8 @@ def __add__(self, other): When `other` is a :class:`Series`, the index of `other` is aligned with the columns of the DataFrame. - >>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height']) - >>> df[['height', 'weight']] + s1 + >>> s1 = pd.Series([0.5, 1.5], index=["weight", "height"]) + >>> df[["height", "weight"]] + s1 height weight elk 3.0 500.5 moose 4.1 800.5 @@ -161,13 +163,13 @@ def __add__(self, other): the :class:`Series` will not be reoriented. If index-wise alignment is desired, :meth:`DataFrame.add` should be used with `axis='index'`. - >>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose']) - >>> df[['height', 'weight']] + s2 + >>> s2 = pd.Series([0.5, 1.5], index=["elk", "moose"]) + >>> df[["height", "weight"]] + s2 elk height moose weight elk NaN NaN NaN NaN moose NaN NaN NaN NaN - >>> df[['height', 'weight']].add(s2, axis='index') + >>> df[["height", "weight"]].add(s2, axis="index") height weight elk 2.0 500.5 moose 4.1 801.5 @@ -175,9 +177,11 @@ def __add__(self, other): When `other` is a :class:`DataFrame`, both columns names and the index are aligned. - >>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]}, - ... index=['elk', 'moose', 'deer']) - >>> df[['height', 'weight']] + other + >>> other = pd.DataFrame( + ... {"height": [0.2, 0.4, 0.6]}, + ... index=["elk", "moose", "deer"], + ... 
) + >>> df[["height", "weight"]] + other height weight deer NaN NaN elk 1.7 NaN diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index cc41985843574..cc194868158a8 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -52,7 +52,8 @@ def _str_get(self, i: int): self._pa_array, start=start, stop=stop, step=step ) null_value = pa.scalar( - None, type=self._pa_array.type # type: ignore[attr-defined] + None, + type=self._pa_array.type, # type: ignore[attr-defined] ) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index cb8f802239146..f529502e8ab8d 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -343,7 +343,9 @@ def fillna( # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( - value, mask, len(self) # type: ignore[arg-type] + value, + mask, + len(self), # type: ignore[arg-type] ) if mask.any(): diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 7f88267943526..9391d9c4595fb 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -95,9 +95,7 @@ def len(self) -> Series: ... [1, 2, 3], ... [3], ... ], - ... dtype=pd.ArrowDtype(pa.list_( - ... pa.int64() - ... )) + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list.len() 0 3 @@ -131,9 +129,7 @@ def __getitem__(self, key: int | slice) -> Series: ... [1, 2, 3], ... [3], ... ], - ... dtype=pd.ArrowDtype(pa.list_( - ... pa.int64() - ... )) + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list[0] 0 1 @@ -190,9 +186,7 @@ def flatten(self) -> Series: ... [1, 2, 3], ... [3], ... ], - ... dtype=pd.ArrowDtype(pa.list_( - ... pa.int64() - ... )) + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list.flatten() 0 1 @@ -248,9 +242,14 @@ def dtypes(self) -> Series: ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) + ... dtype=pd.ArrowDtype( + ... pa.struct( + ... [ + ... ("version", pa.int64()), + ... ("project", pa.string()), + ... ] + ... ) + ... ), ... ) >>> s.struct.dtypes version int64[pyarrow] @@ -294,9 +293,14 @@ def field(self, name_or_index: str | int) -> Series: ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) + ... dtype=pd.ArrowDtype( + ... pa.struct( + ... [ + ... ("version", pa.int64()), + ... ("project", pa.string()), + ... ] + ... ) + ... ), ... ) Extract by field name. @@ -359,9 +363,14 @@ def explode(self) -> DataFrame: ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) + ... dtype=pd.ArrowDtype( + ... pa.struct( + ... [ + ... ("version", pa.int64()), + ... ("project", pa.string()), + ... ] + ... ) + ... ), ... 
)
        >>> s.struct.explode()
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 84d6e2fb7ca53..19e2e1c41e95e 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -828,11 +828,17 @@ def any(self, *, skipna: bool = True, **kwargs):
         The result indicates whether any element is truthy (and by default
         skips NAs):

-        >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
+        >>> pd.array(
+        ...     [True, False, True], dtype="boolean[pyarrow]"
+        ... ).any()
         True
-        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
+        >>> pd.array(
+        ...     [True, False, pd.NA], dtype="boolean[pyarrow]"
+        ... ).any()
         True
-        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
+        >>> pd.array(
+        ...     [False, False, pd.NA], dtype="boolean[pyarrow]"
+        ... ).any()
         False
         >>> pd.array([], dtype="boolean[pyarrow]").any()
         False
@@ -844,13 +850,21 @@ def any(self, *, skipna: bool = True, **kwargs):
         With ``skipna=False``, the result can be NA if this is logically
         required (whether ``pd.NA`` is True or False influences the result):

-        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
+        >>> pd.array(
+        ...     [True, False, pd.NA], dtype="boolean[pyarrow]"
+        ... ).any(skipna=False)
         True
-        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
+        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(
+        ...     skipna=False
+        ... )
         True
-        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
+        >>> pd.array(
+        ...     [False, False, pd.NA], dtype="boolean[pyarrow]"
+        ... ).any(skipna=False)
         <NA>
-        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
+        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(
+        ...     skipna=False
+        ... )
         <NA>
         """
         return self._reduce("any", skipna=skipna, **kwargs)
@@ -886,11 +900,15 @@ def all(self, *, skipna: bool = True, **kwargs):
         The result indicates whether all elements are truthy (and by default
         skips NAs):

-        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
+        >>> pd.array(
+        ...     [True, True, pd.NA], dtype="boolean[pyarrow]"
+        ... ).all()
         True
         >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
         True
-        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
+        >>> pd.array(
+        ...     [True, False, pd.NA], dtype="boolean[pyarrow]"
+        ... ).all()
         False
         >>> pd.array([], dtype="boolean[pyarrow]").all()
         True
@@ -902,13 +920,21 @@ def all(self, *, skipna: bool = True, **kwargs):
         With ``skipna=False``, the result can be NA if this is logically
         required (whether ``pd.NA`` is True or False influences the result):

-        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
+        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(
+        ...     skipna=False
+        ... )
         <NA>
-        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
+        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(
+        ...     skipna=False
+        ... )
         <NA>
-        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
+        >>> pd.array(
+        ...     [True, False, pd.NA], dtype="boolean[pyarrow]"
+        ... ).all(skipna=False)
         False
-        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
+        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(
+        ...     skipna=False
+        ... )
         False
         """
         return self._reduce("all", skipna=skipna, **kwargs)
@@ -2687,7 +2713,8 @@ def _dt_tz_localize(
             "shift_backward": "earliest",
             "shift_forward": "latest",
         }.get(
-            nonexistent, None  # type: ignore[arg-type]
+            nonexistent,
+            None,  # type: ignore[arg-type]
         )
         if nonexistent_pa is None:
             raise NotImplementedError(f"{nonexistent=} is not supported")
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 59c6d911cfaef..5dd6d03a82216 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -350,7 +350,9 @@ def _from_sequence_of_strings(

         Examples
         --------
-        >>> pd.arrays.IntegerArray._from_sequence_of_strings(["1", "2", "3"])
+        >>> pd.arrays.IntegerArray._from_sequence_of_strings(
+        ...     ["1", "2", "3"]
+        ... )
         <IntegerArray>
         [1, 2, 3]
         Length: 3, dtype: Int64
@@ -376,10 +378,17 @@ def _from_factorized(cls, values, original):

         Examples
         --------
-        >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1),
-        ...                                       pd.Interval(1, 5), pd.Interval(1, 5)])
+        >>> interv_arr = pd.arrays.IntervalArray(
+        ...     [
+        ...         pd.Interval(0, 1),
+        ...         pd.Interval(1, 5),
+        ...         pd.Interval(1, 5),
+        ...     ]
+        ... )
         >>> codes, uniques = pd.factorize(interv_arr)
-        >>> pd.arrays.IntervalArray._from_factorized(uniques, interv_arr)
+        >>> pd.arrays.IntervalArray._from_factorized(
+        ...     uniques, interv_arr
+        ... )
         <IntervalArray>
         [(0, 1], (1, 5]]
         Length: 2, dtype: interval[int64, right]
@@ -681,7 +690,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:

         Casting to another ``ExtensionDtype`` returns an ``ExtensionArray``:

-        >>> arr1 = arr.astype('Float64')
+        >>> arr1 = arr.astype("Float64")
         >>> arr1
         <FloatingArray>
         [1.0, 2.0, 3.0]
         Length: 3, dtype: Float64
@@ -691,7 +700,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:

         Otherwise, we will get a Numpy ndarray:

-        >>> arr2 = arr.astype('float64')
+        >>> arr2 = arr.astype("float64")
         >>> arr2
         array([1., 2., 3.])
         >>> arr2.dtype
@@ -934,16 +943,19 @@ def interpolate(

         Examples
         --------
-        >>> arr = pd.arrays.NumpyExtensionArray(np.array([0, 1, np.nan, 3]))
-        >>> arr.interpolate(method="linear",
-        ...                 limit=3,
-        ...                 limit_direction="forward",
-        ...                 index=pd.Index([1, 2, 3, 4]),
-        ...                 fill_value=1,
-        ...                 copy=False,
-        ...                 axis=0,
-        ...                 limit_area="inside"
-        ...                 )
+        >>> arr = pd.arrays.NumpyExtensionArray(
+        ...     np.array([0, 1, np.nan, 3])
+        ... )
+        >>> arr.interpolate(
+        ...     method="linear",
+        ...     limit=3,
+        ...     limit_direction="forward",
+        ...     index=pd.Index([1, 2, 3, 4]),
+        ...     fill_value=1,
+        ...     copy=False,
+        ...     axis=0,
+        ...     limit_area="inside",
+        ... )
         <NumpyExtensionArray>
         [0.0, 1.0, 2.0, 3.0]
         Length: 4, dtype: float64
@@ -1104,7 +1116,9 @@ def fillna(
         # error: Argument 2 to "check_value_size" has incompatible type
         # "ExtensionArray"; expected "ndarray"
         value = missing.check_value_size(
-            value, mask, len(self)  # type: ignore[arg-type]
+            value,
+            mask,
+            len(self),  # type: ignore[arg-type]
         )

         if mask.any():
@@ -1448,8 +1462,17 @@ def factorize(

         Examples
         --------
-        >>> idx1 = pd.PeriodIndex(["2014-01", "2014-01", "2014-02", "2014-02",
-        ...                        "2014-03", "2014-03"], freq="M")
+        >>> idx1 = pd.PeriodIndex(
+        ...     [
+        ...         "2014-01",
+        ...         "2014-01",
+        ...         "2014-02",
+        ...         "2014-02",
+        ...         "2014-03",
+        ...         "2014-03",
+        ...     ],
+        ...     freq="M",
+        ... )
         >>> arr, idx = idx1.factorize()
         >>> arr
         array([0, 0, 1, 1, 2, 2])
@@ -1473,9 +1496,7 @@ def factorize(
         uniques_ea = self._from_factorized(uniques, self)
         return codes, uniques_ea

-    _extension_array_shared_docs[
-        "repeat"
-    ] = """
+    _extension_array_shared_docs["repeat"] = """
     Repeat elements of a %(klass)s.

    Returns a new %(klass)s where each element of the current %(klass)s
@@ -1610,8 +1631,12 @@ def take(self, indices, allow_fill=False, fill_value=None):
                # type for the array, to the physical storage type for
                # the data, before passing to take.

-               result = take(data, indices, fill_value=fill_value,
-                             allow_fill=allow_fill)
+               result = take(
+                   data,
+                   indices,
+                   fill_value=fill_value,
+                   allow_fill=allow_fill,
+               )
                return self._from_sequence(result, dtype=self.dtype)
        """
        # Implementer note: The `fill_value` parameter should be a user-facing
@@ -1750,7 +1775,11 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
         --------
         >>> class MyExtensionArray(pd.arrays.NumpyExtensionArray):
         ...     def _formatter(self, boxed=False):
-        ...         return lambda x: '*' + str(x) + '*' if boxed else repr(x) + '*'
+        ...         return (
+        ...             lambda x: "*" + str(x) + "*"
+        ...             if boxed
+        ...             else repr(x) + "*"
+        ...         )
         >>> MyExtensionArray(np.array([1, 2, 3, 4]))
         <MyExtensionArray>
         [1*, 2*, 3*, 4*]
         Length: 4, dtype: int64
@@ -1885,7 +1914,7 @@ def _accumulate(
         Examples
         --------
         >>> arr = pd.array([1, 2, 3])
-        >>> arr._accumulate(name='cumsum')
+        >>> arr._accumulate(name="cumsum")
         <IntegerArray>
         [1, 3, 6]
         Length: 3, dtype: Int64
@@ -1990,10 +2019,11 @@ def _hash_pandas_object(

         Examples
         --------
-        >>> pd.array([1, 2])._hash_pandas_object(encoding='utf-8',
-        ...                                      hash_key="1000000000000000",
-        ...                                      categorize=False
-        ...                                      )
+        >>> pd.array([1, 2])._hash_pandas_object(
+        ...     encoding="utf-8",
+        ...     hash_key="1000000000000000",
+        ...     categorize=False,
+        ... )
         array([ 6238072747940578789, 15839785061582574730], dtype=uint64)
         """
         from pandas.core.util.hashing import hash_array
@@ -2027,8 +2057,10 @@ def _explode(self) -> tuple[Self, npt.NDArray[np.uint64]]:
         Examples
         --------
         >>> import pyarrow as pa
-        >>> a = pd.array([[1, 2, 3], [4], [5, 6]],
-        ...              dtype=pd.ArrowDtype(pa.list_(pa.int64())))
+        >>> a = pd.array(
+        ...     [[1, 2, 3], [4], [5, 6]],
+        ...     dtype=pd.ArrowDtype(pa.list_(pa.int64())),
+        ... )
         >>> a._explode()
         (<ArrowExtensionArray>
         [1, 2, 3, 4, 5, 6]
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 20aec52b606b6..e534307f0f6d6 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -316,7 +316,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
     [1, 2, 3, 1, 2, 3]
     Categories (3, int64): [1, 2, 3]

-    >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
+    >>> pd.Categorical(["a", "b", "c", "a", "b", "c"])
     ['a', 'b', 'c', 'a', 'b', 'c']
     Categories (3, object): ['a', 'b', 'c']

@@ -336,8 +336,11 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
     Ordered `Categoricals` can be sorted according to the custom order
     of the categories and can have a min and max value.

-    >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
-    ...                    categories=['c', 'b', 'a'])
+    >>> c = pd.Categorical(
+    ...     ["a", "b", "c", "a", "b", "c"],
+    ...     ordered=True,
+    ...     categories=["c", "b", "a"],
+    ...
) >>> c ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['c' < 'b' < 'a'] @@ -488,7 +491,7 @@ def dtype(self) -> CategoricalDtype: Examples -------- - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat ['a', 'b'] Categories (2, object): ['a' < 'b'] @@ -727,7 +730,7 @@ def from_codes( Examples -------- - >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype = pd.CategoricalDtype(["a", "b"], ordered=True) >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) ['a', 'b', 'a', 'b'] Categories (2, object): ['a' < 'b'] @@ -782,28 +785,32 @@ def categories(self) -> Index: -------- For :class:`pandas.Series`: - >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") >>> ser.cat.categories Index(['a', 'b', 'c'], dtype='object') - >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], categories=['b', 'c', 'd']) + >>> raw_cat = pd.Categorical( + ... ["a", "b", "c", "a"], categories=["b", "c", "d"] + ... ) >>> ser = pd.Series(raw_cat) >>> ser.cat.categories Index(['b', 'c', 'd'], dtype='object') For :class:`pandas.Categorical`: - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat.categories Index(['a', 'b'], dtype='object') For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'c', 'b', 'a', 'c', 'b']) + >>> ci = pd.CategoricalIndex(["a", "c", "b", "a", "c", "b"]) >>> ci.categories Index(['a', 'b', 'c'], dtype='object') - >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a']) + >>> ci = pd.CategoricalIndex( + ... ["a", "c"], categories=["c", "b", "a"] + ... ) >>> ci.categories Index(['c', 'b', 'a'], dtype='object') """ @@ -818,32 +825,32 @@ def ordered(self) -> Ordered: -------- For :class:`pandas.Series`: - >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") >>> ser.cat.ordered False - >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True) + >>> raw_cat = pd.Categorical(["a", "b", "c", "a"], ordered=True) >>> ser = pd.Series(raw_cat) >>> ser.cat.ordered True For :class:`pandas.Categorical`: - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat.ordered True - >>> cat = pd.Categorical(['a', 'b'], ordered=False) + >>> cat = pd.Categorical(["a", "b"], ordered=False) >>> cat.ordered False For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=True) + >>> ci = pd.CategoricalIndex(["a", "b"], ordered=True) >>> ci.ordered True - >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=False) + >>> ci = pd.CategoricalIndex(["a", "b"], ordered=False) >>> ci.ordered False """ @@ -869,17 +876,19 @@ def codes(self) -> np.ndarray: -------- For :class:`pandas.Categorical`: - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat.codes array([0, 1], dtype=int8) For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) >>> ci.codes array([0, 1, 2, 0, 1, 2], dtype=int8) - >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a']) + >>> ci = pd.CategoricalIndex( + ... ["a", "c"], categories=["c", "b", "a"] + ... 
) >>> ci.codes array([2, 0], dtype=int8) """ @@ -898,12 +907,12 @@ def _set_categories(self, categories, fastpath: bool = False) -> None: Examples -------- - >>> c = pd.Categorical(['a', 'b']) + >>> c = pd.Categorical(["a", "b"]) >>> c ['a', 'b'] Categories (2, object): ['a', 'b'] - >>> c._set_categories(pd.Index(['a', 'c'])) + >>> c._set_categories(pd.Index(["a", "c"])) >>> c ['a', 'c'] Categories (2, object): ['a', 'c'] @@ -967,7 +976,7 @@ def as_ordered(self) -> Self: -------- For :class:`pandas.Series`: - >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") >>> ser.cat.ordered False >>> ser = ser.cat.as_ordered() @@ -976,7 +985,7 @@ def as_ordered(self) -> Self: For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a']) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a"]) >>> ci.ordered False >>> ci = ci.as_ordered() @@ -998,7 +1007,7 @@ def as_unordered(self) -> Self: -------- For :class:`pandas.Series`: - >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True) + >>> raw_cat = pd.Categorical(["a", "b", "c", "a"], ordered=True) >>> ser = pd.Series(raw_cat) >>> ser.cat.ordered True @@ -1008,7 +1017,7 @@ def as_unordered(self) -> Self: For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'], ordered=True) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a"], ordered=True) >>> ci.ordered True >>> ci = ci.as_unordered() @@ -1069,8 +1078,11 @@ def set_categories(self, new_categories, ordered=None, rename: bool = False): -------- For :class:`pandas.Series`: - >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'], - ... categories=['a', 'b', 'c'], ordered=True) + >>> raw_cat = pd.Categorical( + ... ["a", "b", "c", "A"], + ... categories=["a", "b", "c"], + ... ordered=True, + ... ) >>> ser = pd.Series(raw_cat) >>> ser 0 a @@ -1080,7 +1092,7 @@ def set_categories(self, new_categories, ordered=None, rename: bool = False): dtype: category Categories (3, object): ['a' < 'b' < 'c'] - >>> ser.cat.set_categories(['A', 'B', 'C'], rename=True) + >>> ser.cat.set_categories(["A", "B", "C"], rename=True) 0 A 1 B 2 C @@ -1090,16 +1102,19 @@ def set_categories(self, new_categories, ordered=None, rename: bool = False): For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'A'], - ... categories=['a', 'b', 'c'], ordered=True) + >>> ci = pd.CategoricalIndex( + ... ["a", "b", "c", "A"], + ... categories=["a", "b", "c"], + ... ordered=True, + ... 
) >>> ci CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'], ordered=True, dtype='category') - >>> ci.set_categories(['A', 'b', 'c']) + >>> ci.set_categories(["A", "b", "c"]) CategoricalIndex([nan, 'b', 'c', nan], categories=['A', 'b', 'c'], ordered=True, dtype='category') - >>> ci.set_categories(['A', 'b', 'c'], rename=True) + >>> ci.set_categories(["A", "b", "c"], rename=True) CategoricalIndex(['A', 'b', 'c', nan], categories=['A', 'b', 'c'], ordered=True, dtype='category') """ @@ -1165,7 +1180,7 @@ def rename_categories(self, new_categories) -> Self: Examples -------- - >>> c = pd.Categorical(['a', 'a', 'b']) + >>> c = pd.Categorical(["a", "a", "b"]) >>> c.rename_categories([0, 1]) [0, 0, 1] Categories (2, int64): [0, 1] @@ -1173,7 +1188,7 @@ def rename_categories(self, new_categories) -> Self: For dict-like ``new_categories``, extra keys are ignored and categories not in the dictionary are passed through - >>> c.rename_categories({'a': 'A', 'c': 'C'}) + >>> c.rename_categories({"a": "A", "c": "C"}) ['A', 'A', 'b'] Categories (2, object): ['A', 'b'] @@ -1233,8 +1248,10 @@ def reorder_categories(self, new_categories, ordered=None) -> Self: -------- For :class:`pandas.Series`: - >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') - >>> ser = ser.cat.reorder_categories(['c', 'b', 'a'], ordered=True) + >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") + >>> ser = ser.cat.reorder_categories( + ... ["c", "b", "a"], ordered=True + ... ) >>> ser 0 a 1 b @@ -1253,11 +1270,11 @@ def reorder_categories(self, new_categories, ordered=None) -> Self: For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a']) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a"]) >>> ci CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, dtype='category') - >>> ci.reorder_categories(['c', 'b', 'a'], ordered=True) + >>> ci.reorder_categories(["c", "b", "a"], ordered=True) CategoricalIndex(['a', 'b', 'c', 'a'], categories=['c', 'b', 'a'], ordered=True, dtype='category') """ @@ -1303,12 +1320,12 @@ def add_categories(self, new_categories) -> Self: Examples -------- - >>> c = pd.Categorical(['c', 'b', 'c']) + >>> c = pd.Categorical(["c", "b", "c"]) >>> c ['c', 'b', 'c'] Categories (2, object): ['b', 'c'] - >>> c.add_categories(['d', 'a']) + >>> c.add_categories(["d", "a"]) ['c', 'b', 'c'] Categories (4, object): ['b', 'c', 'd', 'a'] """ @@ -1371,12 +1388,12 @@ def remove_categories(self, removals) -> Self: Examples -------- - >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd']) + >>> c = pd.Categorical(["a", "c", "b", "c", "d"]) >>> c ['a', 'c', 'b', 'c', 'd'] Categories (4, object): ['a', 'b', 'c', 'd'] - >>> c.remove_categories(['d', 'a']) + >>> c.remove_categories(["d", "a"]) [NaN, 'c', 'b', 'c', NaN] Categories (2, object): ['b', 'c'] """ @@ -1418,13 +1435,13 @@ def remove_unused_categories(self) -> Self: Examples -------- - >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd']) + >>> c = pd.Categorical(["a", "c", "b", "c", "d"]) >>> c ['a', 'c', 'b', 'c', 'd'] Categories (4, object): ['a', 'b', 'c', 'd'] - >>> c[2] = 'a' - >>> c[4] = 'c' + >>> c[2] = "a" + >>> c[4] = "c" >>> c ['a', 'c', 'a', 'c', 'c'] Categories (4, object): ['a', 'b', 'c', 'd'] @@ -1498,37 +1515,43 @@ def map( Examples -------- - >>> cat = pd.Categorical(['a', 'b', 'c']) + >>> cat = pd.Categorical(["a", "b", "c"]) >>> cat ['a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] >>> cat.map(lambda x: x.upper(), na_action=None) ['A', 'B', 'C'] Categories (3, object): 
['A', 'B', 'C'] - >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}, na_action=None) + >>> cat.map( + ... {"a": "first", "b": "second", "c": "third"}, + ... na_action=None, + ... ) ['first', 'second', 'third'] Categories (3, object): ['first', 'second', 'third'] If the mapping is one-to-one the ordering of the categories is preserved: - >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True) + >>> cat = pd.Categorical(["a", "b", "c"], ordered=True) >>> cat ['a', 'b', 'c'] Categories (3, object): ['a' < 'b' < 'c'] - >>> cat.map({'a': 3, 'b': 2, 'c': 1}, na_action=None) + >>> cat.map({"a": 3, "b": 2, "c": 1}, na_action=None) [3, 2, 1] Categories (3, int64): [3 < 2 < 1] If the mapping is not one-to-one an :class:`~pandas.Index` is returned: - >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}, na_action=None) + >>> cat.map( + ... {"a": "first", "b": "second", "c": "first"}, + ... na_action=None, + ... ) Index(['first', 'second', 'first'], dtype='object') If a `dict` is used, all unmapped categories are mapped to `NaN` and the result is an :class:`~pandas.Index`: - >>> cat.map({'a': 'first', 'b': 'second'}, na_action=None) + >>> cat.map({"a": "first", "b": "second"}, na_action=None) Index(['first', 'second', nan], dtype='object') """ if na_action is lib.no_default: @@ -1640,7 +1663,7 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: Examples -------- - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) The following calls ``cat.__array__`` @@ -1904,12 +1927,14 @@ def argsort( Examples -------- - >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() + >>> pd.Categorical(["b", "b", "a", "c"]).argsort() array([2, 0, 1, 3]) - >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], - ... categories=['c', 'b', 'a'], - ... ordered=True) + >>> cat = pd.Categorical( + ... ["b", "b", "a", "c"], + ... categories=["c", "b", "a"], + ... ordered=True, + ... ) >>> cat.argsort() array([3, 0, 1, 2]) @@ -2003,10 +2028,10 @@ def sort_values( >>> c.sort_values(ascending=False) [5, 2, 2, NaN, NaN] Categories (2, int64): [2, 5] - >>> c.sort_values(na_position='first') + >>> c.sort_values(na_position="first") [NaN, NaN, 2, 2, 5] Categories (2, int64): [2, 5] - >>> c.sort_values(ascending=False, na_position='first') + >>> c.sort_values(ascending=False, na_position="first") [NaN, NaN, 5, 2, 2] Categories (2, int64): [2, 5] """ @@ -2318,7 +2343,7 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: Examples -------- - >>> c = pd.Categorical(list('aabca')) + >>> c = pd.Categorical(list("aabca")) >>> c ['a', 'a', 'b', 'c', 'a'] Categories (3, object): ['a', 'b', 'c'] @@ -2456,7 +2481,9 @@ def unique(self) -> Self: >>> pd.Categorical(list("baabc")).unique() ['b', 'a', 'c'] Categories (3, object): ['a', 'b', 'c'] - >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique() + >>> pd.Categorical( + ... list("baab"), categories=list("abc"), ordered=True + ... ).unique() ['b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] """ @@ -2600,15 +2627,16 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: Examples -------- - >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama', - ... 'hippo']) - >>> s.isin(['cow', 'lama']) + >>> s = pd.Categorical( + ... ["llama", "cow", "llama", "beetle", "llama", "hippo"] + ... ) + >>> s.isin(["cow", "llama"]) array([ True, True, True, False, True, False]) Passing a single string as ``s.isin('lama')`` will raise an error. 
Use a list of one element instead: - >>> s.isin(['lama']) + >>> s.isin(["llama"]) array([ True, False, True, False, True, False]) """ null_mask = np.asarray(isna(values)) @@ -2911,7 +2939,9 @@ def codes(self) -> Series: Examples -------- - >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"]) + >>> raw_cate = pd.Categorical( + ... ["a", "b", "c", "a"], categories=["a", "b"] + ... ) >>> ser = pd.Series(raw_cate) >>> ser.cat.codes 0 0 @@ -2968,8 +2998,8 @@ def recode_for_categories( Examples -------- - >>> old_cat = pd.Index(['b', 'a', 'c']) - >>> new_cat = pd.Index(['a', 'b']) + >>> old_cat = pd.Index(["b", "a", "c"]) + >>> new_cat = pd.Index(["a", "b"]) >>> codes = np.array([0, 1, 1, 2]) >>> recode_for_categories(codes, old_cat, new_cat) array([ 1, 0, 0, -1], dtype=int8) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 91dd40c2deced..67c9be721a553 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -269,7 +269,9 @@ def _unbox_scalar( Examples -------- - >>> arr = pd.arrays.DatetimeArray(np.array(['1970-01-01'], 'datetime64[ns]')) + >>> arr = pd.arrays.DatetimeArray( + ... np.array(["1970-01-01"], "datetime64[ns]") + ... ) >>> arr._unbox_scalar(arr[0]) numpy.datetime64('1970-01-01T00:00:00.000000000') """ @@ -881,20 +883,25 @@ def freqstr(self) -> str | None: -------- For DatetimeIndex: - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00"], freq="D") + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00:00+00:00"], freq="D" + ... ) >>> idx.freqstr 'D' The frequency can be inferred if there are more than 2 points: - >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"], - ... freq="infer") + >>> idx = pd.DatetimeIndex( + ... ["2018-01-01", "2018-01-03", "2018-01-05"], freq="infer" + ... ) >>> idx.freqstr '2D' For PeriodIndex: - >>> idx = pd.PeriodIndex(["2023-1", "2023-2", "2023-3"], freq="M") + >>> idx = pd.PeriodIndex( + ... ["2023-1", "2023-2", "2023-3"], freq="M" + ... ) >>> idx.freqstr 'M' """ @@ -913,13 +920,17 @@ def inferred_freq(self) -> str | None: -------- For DatetimeIndex: - >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"]) + >>> idx = pd.DatetimeIndex( + ... ["2018-01-01", "2018-01-03", "2018-01-05"] + ... ) >>> idx.inferred_freq '2D' For TimedeltaIndex: - >>> tdelta_idx = pd.to_timedelta(["0 days", "10 days", "20 days"]) + >>> tdelta_idx = pd.to_timedelta( + ... ["0 days", "10 days", "20 days"] + ... 
)
        >>> tdelta_idx
        TimedeltaIndex(['0 days', '10 days', '20 days'],
                       dtype='timedelta64[ns]', freq=None)
@@ -1238,7 +1249,9 @@ def _add_timedeltalike(self, other: Timedelta | TimedeltaArray):
         # incompatible type "Union[dtype[datetime64], DatetimeTZDtype,
         # dtype[timedelta64]]"; expected "Union[dtype[datetime64], DatetimeTZDtype]"
         return type(self)._simple_new(
-            res_values, dtype=self.dtype, freq=new_freq  # type: ignore[arg-type]
+            res_values,
+            dtype=self.dtype,
+            freq=new_freq,  # type: ignore[arg-type]
         )

     @final
@@ -1261,7 +1274,9 @@ def _add_nat(self):
         # incompatible type "Union[dtype[timedelta64], dtype[datetime64],
         # DatetimeTZDtype]"; expected "Union[dtype[datetime64], DatetimeTZDtype]"
         return type(self)._simple_new(
-            result, dtype=self.dtype, freq=None  # type: ignore[arg-type]
+            result,
+            dtype=self.dtype,
+            freq=None,  # type: ignore[arg-type]
         )

     @final
@@ -1597,7 +1612,7 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0):
         --------
         For :class:`pandas.DatetimeIndex`:

-        >>> idx = pd.date_range('2001-01-01 00:00', periods=3)
+        >>> idx = pd.date_range("2001-01-01 00:00", periods=3)
         >>> idx
         DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'],
                       dtype='datetime64[ns]', freq='D')
@@ -1606,7 +1621,7 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0):

         For :class:`pandas.TimedeltaIndex`:

-        >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='D')
+        >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit="D")
         >>> tdelta_idx
         TimedeltaIndex(['1 days', '2 days', '3 days'],
                        dtype='timedelta64[ns]', freq=None)
@@ -1776,9 +1791,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:

         Examples
         --------
-        >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"),
-        ...                     periods=3, freq='s')
-        >>> rng.strftime('%%B %%d, %%Y, %%r')
+        >>> rng = pd.date_range(
+        ...     pd.Timestamp("2018-03-10 09:00"), periods=3, freq="s"
+        ... )
+        >>> rng.strftime("%%B %%d, %%Y, %%r")
         Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM',
                'March 10, 2018, 09:00:02 AM'],
               dtype='object')
@@ -2160,7 +2176,9 @@ def as_unit(self, unit: str, round_ok: bool = True) -> Self:
         # error: Unexpected keyword argument "freq" for "_simple_new" of
         # "NDArrayBacked"  [call-arg]
         return type(self)._simple_new(
-            new_values, dtype=new_dtype, freq=self.freq  # type: ignore[call-arg]
+            new_values,
+            dtype=new_dtype,
+            freq=self.freq,  # type: ignore[call-arg]
         )

     # TODO: annotate other as DatetimeArray | TimedeltaArray | Timestamp | Timedelta
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 0074645a482b2..a14a2eb05df7b 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -202,8 +202,9 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):  # type: ignore[misc]

     Examples
     --------
-    >>> pd.arrays.DatetimeArray(pd.DatetimeIndex(['2023-01-01', '2023-01-02']),
-    ...                         freq='D')
+    >>> pd.arrays.DatetimeArray(
+    ...     pd.DatetimeIndex(["2023-01-01", "2023-01-02"]), freq="D"
+    ... )
    <DatetimeArray>
    ['2023-01-01 00:00:00', '2023-01-02 00:00:00']
    Length: 2, dtype: datetime64[ns]
@@ -592,7 +593,9 @@ def tz(self) -> tzinfo | None:
         --------
         For Series:

-        >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"])
+        >>> s = pd.Series(
+        ...     ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]
+        ... )
         >>> s = pd.to_datetime(s)
         >>> s
         0   2020-01-01 10:00:00+00:00
         1   2020-02-01 11:00:00+00:00
         dtype: datetime64[ns, UTC]
         >>> s.dt.tz
         datetime.timezone.utc

         For DatetimeIndex:

-        >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00",
-        ...                         "2/1/2020 11:00:00+00:00"])
+        >>> idx = pd.DatetimeIndex(
+        ...
["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... ) >>> idx.tz datetime.timezone.utc """ @@ -880,8 +884,12 @@ def tz_convert(self, tz) -> Self: With the `tz` parameter, we can change the DatetimeIndex to other time zones: - >>> dti = pd.date_range(start='2014-08-01 09:00', - ... freq='h', periods=3, tz='Europe/Berlin') + >>> dti = pd.date_range( + ... start="2014-08-01 09:00", + ... freq="h", + ... periods=3, + ... tz="Europe/Berlin", + ... ) >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', @@ -889,7 +897,7 @@ def tz_convert(self, tz) -> Self: '2014-08-01 11:00:00+02:00'], dtype='datetime64[ns, Europe/Berlin]', freq='h') - >>> dti.tz_convert('US/Central') + >>> dti.tz_convert("US/Central") DatetimeIndex(['2014-08-01 02:00:00-05:00', '2014-08-01 03:00:00-05:00', '2014-08-01 04:00:00-05:00'], @@ -898,8 +906,12 @@ def tz_convert(self, tz) -> Self: With the ``tz=None``, we can remove the timezone (after converting to UTC if necessary): - >>> dti = pd.date_range(start='2014-08-01 09:00', freq='h', - ... periods=3, tz='Europe/Berlin') + >>> dti = pd.date_range( + ... start="2014-08-01 09:00", + ... freq="h", + ... periods=3, + ... tz="Europe/Berlin", + ... ) >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', @@ -1123,7 +1135,7 @@ def to_pydatetime(self) -> npt.NDArray[np.object_]: Examples -------- - >>> idx = pd.date_range('2018-02-27', periods=3) + >>> idx = pd.date_range("2018-02-27", periods=3) >>> idx.to_pydatetime() array([datetime.datetime(2018, 2, 27, 0, 0), datetime.datetime(2018, 2, 28, 0, 0), @@ -1156,8 +1168,12 @@ def normalize(self) -> Self: Examples -------- - >>> idx = pd.date_range(start='2014-08-01 10:00', freq='h', - ... periods=3, tz='Asia/Calcutta') + >>> idx = pd.date_range( + ... start="2014-08-01 10:00", + ... freq="h", + ... periods=3, + ... tz="Asia/Calcutta", + ... ) >>> idx DatetimeIndex(['2014-08-01 10:00:00+05:30', '2014-08-01 11:00:00+05:30', @@ -1207,10 +1223,16 @@ def to_period(self, freq=None) -> PeriodArray: Examples -------- - >>> df = pd.DataFrame({"y": [1, 2, 3]}, - ... index=pd.to_datetime(["2000-03-31 00:00:00", - ... "2000-05-31 00:00:00", - ... "2000-08-31 00:00:00"])) + >>> df = pd.DataFrame( + ... {"y": [1, 2, 3]}, + ... index=pd.to_datetime( + ... [ + ... "2000-03-31 00:00:00", + ... "2000-05-31 00:00:00", + ... "2000-08-31 00:00:00", + ... ] + ... ), + ... ) >>> df.index.to_period("M") PeriodIndex(['2000-03', '2000-05', '2000-08'], dtype='period[M]') @@ -1273,7 +1295,9 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: Examples -------- - >>> s = pd.Series(pd.date_range(start='2018-01', freq='ME', periods=3)) + >>> s = pd.Series( + ... pd.date_range(start="2018-01", freq="ME", periods=3) + ... ) >>> s 0 2018-01-31 1 2018-02-28 @@ -1285,7 +1309,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: 2 March dtype: object - >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) + >>> idx = pd.date_range(start="2018-01", freq="ME", periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq='ME') @@ -1296,11 +1320,11 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month names in Brazilian Portuguese language. 
- >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) + >>> idx = pd.date_range(start="2018-01", freq="ME", periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq='ME') - >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP + >>> idx.month_name(locale="pt_BR.utf8") # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ values = self._local_timestamps() @@ -1330,7 +1354,9 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: Examples -------- - >>> s = pd.Series(pd.date_range(start='2018-01-01', freq='D', periods=3)) + >>> s = pd.Series( + ... pd.date_range(start="2018-01-01", freq="D", periods=3) + ... ) >>> s 0 2018-01-01 1 2018-01-02 @@ -1342,7 +1368,7 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: 2 Wednesday dtype: object - >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx = pd.date_range(start="2018-01-01", freq="D", periods=3) >>> idx DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq='D') @@ -1353,11 +1379,11 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: for example: ``idx.day_name(locale='pt_BR.utf8')`` will return day names in Brazilian Portuguese language. - >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx = pd.date_range(start="2018-01-01", freq="D", periods=3) >>> idx DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq='D') - >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP + >>> idx.day_name(locale="pt_BR.utf8") # doctest: +SKIP Index(['Segunda', 'Terça', 'Quarta'], dtype='object') """ values = self._local_timestamps() @@ -1379,7 +1405,9 @@ def time(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.Series( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... ) >>> s = pd.to_datetime(s) >>> s 0 2020-01-01 10:00:00+00:00 @@ -1392,8 +1420,9 @@ def time(self) -> npt.NDArray[np.object_]: For DatetimeIndex: - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... ) >>> idx.time array([datetime.time(10, 0), datetime.time(11, 0)], dtype=object) """ @@ -1415,7 +1444,9 @@ def timetz(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.Series( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... ) >>> s = pd.to_datetime(s) >>> s 0 2020-01-01 10:00:00+00:00 @@ -1428,8 +1459,9 @@ def timetz(self) -> npt.NDArray[np.object_]: For DatetimeIndex: - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... ) >>> idx.timetz array([datetime.time(10, 0, tzinfo=datetime.timezone.utc), datetime.time(11, 0, tzinfo=datetime.timezone.utc)], dtype=object) @@ -1448,7 +1480,9 @@ def date(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.Series( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... 
)
         >>> s = pd.to_datetime(s)
         >>> s
         0   2020-01-01 10:00:00+00:00
@@ -1461,8 +1495,9 @@ def date(self) -> npt.NDArray[np.object_]:
 
         For DatetimeIndex:
 
-        >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00",
-        ...                         "2/1/2020 11:00:00+00:00"])
+        >>> idx = pd.DatetimeIndex(
+        ...     ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]
+        ... )
         >>> idx.date
         array([datetime.date(2020, 1, 1), datetime.date(2020, 2, 1)], dtype=object)
         """
@@ -1491,7 +1526,7 @@ def isocalendar(self) -> DataFrame:
 
         Examples
         --------
-        >>> idx = pd.date_range(start='2019-12-29', freq='D', periods=4)
+        >>> idx = pd.date_range(start="2019-12-29", freq="D", periods=4)
         >>> idx.isocalendar()
                     year  week  day
         2019-12-29  2019    52    7
@@ -2159,7 +2194,7 @@ def std(
         --------
         For :class:`pandas.DatetimeIndex`:
 
-        >>> idx = pd.date_range('2001-01-01 00:00', periods=3)
+        >>> idx = pd.date_range("2001-01-01 00:00", periods=3)
         >>> idx
         DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'],
                       dtype='datetime64[ns]', freq='D')
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index f9384e25ba9d9..dc453f3e37c50 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -115,12 +115,12 @@ class IntegerArray(NumericArray):
 
     String aliases for the dtypes are also available. They are capitalized.
 
-    >>> pd.array([1, None, 3], dtype='Int32')
+    >>> pd.array([1, None, 3], dtype="Int32")
     <IntegerArray>
     [1, <NA>, 3]
     Length: 3, dtype: Int32
 
-    >>> pd.array([1, None, 3], dtype='UInt16')
+    >>> pd.array([1, None, 3], dtype="UInt16")
     <IntegerArray>
     [1, <NA>, 3]
     Length: 3, dtype: UInt16
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index a19b304529383..ffc0338485de1 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -121,9 +121,7 @@
 }
 
 
-_interval_shared_docs[
-    "class"
-] = """
+_interval_shared_docs["class"] = """
 %(summary)s
 
 Parameters
@@ -1257,7 +1255,9 @@ def left(self) -> Index:
 
         Examples
         --------
-        >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(2, 5)])
+        >>> interv_arr = pd.arrays.IntervalArray(
+        ...     [pd.Interval(0, 1), pd.Interval(2, 5)]
+        ... )
         >>> interv_arr
         <IntervalArray>
         [(0, 1], (2, 5]]
@@ -1277,7 +1277,9 @@ def right(self) -> Index:
 
         Examples
         --------
-        >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(2, 5)])
+        >>> interv_arr = pd.arrays.IntervalArray(
+        ...     [pd.Interval(0, 1), pd.Interval(2, 5)]
+        ... )
         >>> interv_arr
         <IntervalArray>
         [(0, 1], (2, 5]]
@@ -1297,7 +1299,9 @@ def length(self) -> Index:
 
         Examples
         --------
-        >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])
+        >>> interv_arr = pd.arrays.IntervalArray(
+        ...     [pd.Interval(0, 1), pd.Interval(1, 5)]
+        ... )
         >>> interv_arr
         <IntervalArray>
         [(0, 1], (1, 5]]
@@ -1315,7 +1319,9 @@ def mid(self) -> Index:
 
         Examples
         --------
-        >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])
+        >>> interv_arr = pd.arrays.IntervalArray(
+        ...     [pd.Interval(0, 1), pd.Interval(1, 5)]
+        ... )
         >>> interv_arr
         <IntervalArray>
         [(0, 1], (1, 5]]
@@ -1415,7 +1421,9 @@ def closed(self) -> IntervalClosedType:
 
         For arrays:
 
-        >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])
+        >>> interv_arr = pd.arrays.IntervalArray(
+        ...     [pd.Interval(0, 1), pd.Interval(1, 5)]
+        ... )
         >>> interv_arr
         <IntervalArray>
         [(0, 1], (1, 5]]
@@ -1481,9 +1489,7 @@ def set_closed(self, closed: IntervalClosedType) -> Self:
         dtype = IntervalDtype(left.dtype, closed=closed)
         return self._simple_new(left, right, dtype=dtype)
 
-    _interval_shared_docs[
-        "is_non_overlapping_monotonic"
-    ] = """
+    _interval_shared_docs["is_non_overlapping_monotonic"] = """
     Return a boolean whether the %(klass)s is non-overlapping and monotonic.
 
     Non-overlapping means (no Intervals share points), and monotonic means
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 00c7276a2216e..48fa417aada28 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -451,7 +451,9 @@ def to_numpy(
 
         When no missing values are present, an equivalent dtype can be used.
 
-        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
+        >>> pd.array([True, False], dtype="boolean").to_numpy(
+        ...     dtype="bool"
+        ... )
         array([ True, False])
         >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
         array([1, 2])
@@ -1097,7 +1099,8 @@ def value_counts(self, dropna: bool = True) -> Series:
             arr = IntegerArray(value_counts, mask)
             index = Index(
                 self.dtype.construct_array_type()(
-                    keys, mask_index  # type: ignore[arg-type]
+                    keys,
+                    mask_index,  # type: ignore[arg-type]
                 )
             )
             return Series(arr, index=index, name="count", copy=False)
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 0e2d4409b9f39..102f753f6cd2c 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -172,8 +172,9 @@ class PeriodArray(dtl.DatelikeOps, libperiod.PeriodMixin):  # type: ignore[misc]
 
     Examples
     --------
-    >>> pd.arrays.PeriodArray(pd.PeriodIndex(['2023-01-01',
-    ...                                       '2023-01-02'], freq='D'))
+    >>> pd.arrays.PeriodArray(
+    ...     pd.PeriodIndex(["2023-01-01", "2023-01-02"], freq="D")
+    ... )
     <PeriodArray>
     ['2023-01-01', '2023-01-02']
     Length: 2, dtype: period[D]
@@ -633,7 +634,9 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray:
 
         Examples
         --------
-        >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M")
+        >>> idx = pd.PeriodIndex(
+        ...     ["2023-01", "2023-02", "2023-03"], freq="M"
+        ... )
         >>> idx.to_timestamp()
         DatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01'],
         dtype='datetime64[ns]', freq='MS')
@@ -719,16 +722,16 @@ def asfreq(self, freq=None, how: str = "E") -> Self:
 
         Examples
         --------
-        >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='Y')
+        >>> pidx = pd.period_range("2010-01-01", "2015-01-01", freq="Y")
        >>> pidx
         PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'],
         dtype='period[Y-DEC]')
 
-        >>> pidx.asfreq('M')
+        >>> pidx.asfreq("M")
         PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12',
         '2015-12'], dtype='period[M]')
 
-        >>> pidx.asfreq('M', how='S')
+        >>> pidx.asfreq("M", how="S")
         PeriodIndex(['2010-01', '2011-01', '2012-01', '2013-01', '2014-01',
         '2015-01'], dtype='period[M]')
         """
@@ -1028,29 +1031,39 @@ def period_array(
 
     Examples
    --------
-    >>> period_array([pd.Period('2017', freq='Y'),
-    ...               pd.Period('2018', freq='Y')])
+    >>> period_array(
+    ...     [
+    ...         pd.Period("2017", freq="Y"),
+    ...         pd.Period("2018", freq="Y"),
+    ...     ]
+    ... )
    <PeriodArray>
     ['2017', '2018']
     Length: 2, dtype: period[Y-DEC]
 
-    >>> period_array([pd.Period('2017', freq='Y'),
-    ...               pd.Period('2018', freq='Y'),
-    ...               pd.NaT])
+    >>> period_array(
+    ...     [
+    ...         pd.Period("2017", freq="Y"),
+    ...         pd.Period("2018", freq="Y"),
+    ...         pd.NaT,
+    ...     ]
+    ... )
    <PeriodArray>
    ['2017', '2018', 'NaT']
    Length: 3, dtype: period[Y-DEC]
 
     Integers that look like years are handled
 
-    >>> period_array([2000, 2001, 2002], freq='D')
+    >>> period_array([2000, 2001, 2002], freq="D")
     <PeriodArray>
     ['2000-01-01', '2001-01-01', '2002-01-01']
     Length: 3, dtype: period[D]
 
     Datetime-like strings may also be passed
 
-    >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q')
+    >>> period_array(
+    ...     ["2000-Q1", "2000-Q2", "2000-Q3", "2000-Q4"], freq="Q"
+    ... )
     <PeriodArray>
     ['2000Q1', '2000Q2', '2000Q3', '2000Q4']
     Length: 4, dtype: period[Q-DEC]
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index fc7debb1f31e4..2e0f5834f4e1e 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -149,7 +149,7 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False)
         ...     (1, 1, "b", 0),
         ...     (1, 1, "b", 1),
         ...     (2, 1, "b", 0),
-        ...     (2, 1, "b", 1)
+        ...     (2, 1, "b", 1),
         ... ],
         ...     names=["A", "B", "C", "D"],
         ... )
@@ -175,7 +175,9 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False)
         dtype: Sparse[float64, nan]
 
         >>> A, rows, columns = ss.sparse.to_coo(
-        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
+        ...     row_levels=["A", "B"],
+        ...     column_levels=["C", "D"],
+        ...     sort_labels=True,
         ... )
         >>> A
         <3x4 sparse matrix of type '<class 'numpy.float64'>'
         with 3 stored elements in COOrdinate format>
@@ -237,8 +239,10 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate):
 
     Examples
     --------
-    >>> df = pd.DataFrame({"a": [1, 2, 0, 0],
-    ...                    "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
+    >>> df = pd.DataFrame(
+    ...     {"a": [1, 2, 0, 0], "b": [3, 0, 0, 4]},
+    ...     dtype="Sparse[int]",
+    ... )
     >>> df.sparse.density
     0.5
     """
@@ -348,7 +352,9 @@ def to_coo(self):
 
         Examples
        --------
-        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
+        >>> df = pd.DataFrame(
+        ...     {"A": pd.arrays.SparseArray([0, 1, 0, 1])}
+        ... )
         >>> df.sparse.to_coo()
         <4x1 sparse matrix of type '<class 'numpy.int64'>'
         with 2 stored elements in COOrdinate format>
@@ -383,7 +389,9 @@ def density(self) -> float:
 
         Examples
         --------
-        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
+        >>> df = pd.DataFrame(
+        ...     {"A": pd.arrays.SparseArray([0, 1, 0, 1])}
+        ... )
         >>> df.sparse.density
         0.5
         """
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 5db77db2a9c66..2aa4aaa61c737 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -454,7 +454,8 @@ def __init__(
             # error: Argument "dtype" to "asarray" has incompatible type
             # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
             sparse_values = np.asarray(
-                data.sp_values, dtype=dtype  # type: ignore[arg-type]
+                data.sp_values,
+                dtype=dtype,  # type: ignore[arg-type]
             )
         elif sparse_index is None:
             data = extract_array(data, extract_numpy=True)
@@ -1241,7 +1242,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
         IntIndex
         Indices: array([2, 3], dtype=int32)
 
-        >>> arr.astype(SparseDtype(np.dtype('int32')))
+        >>> arr.astype(SparseDtype(np.dtype("int32")))
         [0, 0, 1, 2]
         Fill: 0
         IntIndex
@@ -1250,7 +1251,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
         Using a NumPy dtype with a different kind (e.g. float) will coerce
         just ``self.sp_values``.
 
-        >>> arr.astype(SparseDtype(np.dtype('float64')))
+        >>> arr.astype(SparseDtype(np.dtype("float64")))
         ... # doctest: +NORMALIZE_WHITESPACE
         [nan, nan, 1.0, 2.0]
         Fill: nan
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 00197a150fb97..a14c3f23fafa5 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -326,7 +326,9 @@ class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
 
     Examples
     --------
-    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
+    >>> pd.array(
+    ...     ["This is", "some text", None, "data."], dtype="string"
+    ... )
     <StringArray>
     ['This is', 'some text', <NA>, 'data.']
     Length: 4, dtype: string
 
@@ -334,11 +336,11 @@ class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
     Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
     will convert the values to strings.
 
-    >>> pd.array(['1', 1], dtype="object")
+    >>> pd.array(["1", 1], dtype="object")
     <NumpyExtensionArray>
     ['1', 1]
     Length: 2, dtype: object
 
-    >>> pd.array(['1', 1], dtype="string")
+    >>> pd.array(["1", 1], dtype="string")
     <StringArray>
     ['1', '1']
     Length: 2, dtype: string
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 32ab3054c0f51..5caa7c1ee6a20 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -114,7 +114,10 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
 
     Examples
     --------
-    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
+    >>> pd.array(
+    ...     ["This is", "some text", None, "data."],
+    ...     dtype="string[pyarrow]",
+    ... )
     <ArrowStringArray>
     ['This is', 'some text', <NA>, 'data.']
     Length: 4, dtype: string
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index ccb63d6677b1a..9e9338e93cb72 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -132,7 +132,7 @@ class TimedeltaArray(dtl.TimelikeOps):
 
     Examples
     --------
-    >>> pd.arrays.TimedeltaArray(pd.TimedeltaIndex(['1h', '2h']))
+    >>> pd.arrays.TimedeltaArray(pd.TimedeltaIndex(["1h", "2h"]))
     <TimedeltaArray>
     ['0 days 01:00:00', '0 days 02:00:00']
     Length: 2, dtype: timedelta64[ns]
@@ -747,7 +747,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]:
         --------
         **Series**
 
-        >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d'))
+        >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d"))
         >>> s
         0   0 days
         1   1 days
@@ -766,7 +766,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]:
 
         **TimedeltaIndex**
 
-        >>> idx = pd.to_timedelta(np.arange(5), unit='d')
+        >>> idx = pd.to_timedelta(np.arange(5), unit="d")
         >>> idx
         TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
                        dtype='timedelta64[ns]', freq=None)
@@ -787,7 +787,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]:
 
         Examples
         --------
-        >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='D')
+        >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit="D")
         >>> tdelta_idx
         TimedeltaIndex(['1 days', '2 days', '3 days'],
                        dtype='timedelta64[ns]', freq=None)
@@ -945,7 +945,7 @@ def components(self) -> DataFrame:
 
         Examples
         --------
-        >>> tdelta_idx = pd.to_timedelta(['1 day 3 min 2 us 42 ns'])
+        >>> tdelta_idx = pd.to_timedelta(["1 day 3 min 2 us 42 ns"])
         >>> tdelta_idx
         TimedeltaIndex(['1 days 00:03:00.000002042'],
                        dtype='timedelta64[ns]', freq=None)
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 9c722bad019a4..f0980a45c85a8 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -367,7 +367,7 @@ def ndim(self) -> Literal[1]:
 
         Examples
         --------
-        >>> s = pd.Series(['Ant', 'Bear', 'Cow'])
+        >>> s = pd.Series(["Ant", "Bear", "Cow"])
         >>> s
         0     Ant
         1    Bear
@@ -409,7 +409,7 @@ def item(self):
 
         For an index:
 
-        >>> s = pd.Series([1], index=['a'])
+        >>> s = pd.Series([1], index=["a"])
         >>> s.index.item()
         'a'
         """
@@ -426,7 +426,7 @@ def nbytes(self) -> int:
         --------
         For Series:
 
-        >>> s = pd.Series(['Ant', 'Bear', 'Cow'])
+        >>> s = pd.Series(["Ant", "Bear", "Cow"])
         >>> s
         0     Ant
         1    Bear
@@ -454,7 +454,7 @@ def size(self) -> int:
         --------
         For Series:
 
-        >>> s = pd.Series(['Ant', 'Bear', 'Cow'])
+        >>> s = pd.Series(["Ant", "Bear", "Cow"])
         >>> s
         0     Ant
         1    Bear
@@ -531,7 +531,7 @@ def array(self) -> ExtensionArray:
         For extension types, like Categorical, the actual ExtensionArray
         is returned
 
-        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
+        >>> ser = pd.Series(pd.Categorical(["a", "b", "a"]))
         >>> ser.array
         ['a', 'b', 'a']
         Categories (2, object): ['a', 'b']
@@ -610,7 +610,7 @@ def to_numpy(
 
         Examples
         --------
-        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
+        >>> ser = pd.Series(pd.Categorical(["a", "b", "a"]))
         >>> ser.to_numpy()
         array(['a', 'b', 'a'], dtype=object)
 
@@ -618,7 +618,7 @@ def to_numpy(
         Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
         objects, each with the correct ``tz``.
 
-        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
+        >>> ser = pd.Series(pd.date_range("2000", periods=2, tz="CET"))
         >>> ser.to_numpy(dtype=object)
         array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
                Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
@@ -713,8 +713,14 @@ def argmax(
         --------
         Consider dataset containing cereal calories
 
-        >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0,
-        ...                'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}})
+        >>> s = pd.Series(
+        ...     {{
+        ...         "Corn Flakes": 100.0,
+        ...         "Almond Delight": 110.0,
+        ...         "Cinnamon Toast Crunch": 120.0,
+        ...         "Cocoa Puff": 110.0,
+        ...     }}
+        ... )
         >>> s
         Corn Flakes              100.0
         Almond Delight           110.0
@@ -1207,9 +1215,7 @@ def factorize(
         uniques = Index(uniques)
         return codes, uniques
 
-    _shared_docs[
-        "searchsorted"
-    ] = """
+    _shared_docs["searchsorted"] = """
         Find indices where elements should be inserted to maintain order.
 
         Find the indices into a sorted {klass} `self` such that, if the
diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py
index f1fe528de06f8..83e821c7ae497 100644
--- a/pandas/core/computation/eval.py
+++ b/pandas/core/computation/eval.py
@@ -283,7 +283,9 @@ def eval(
 
     Examples
     --------
-    >>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
+    >>> df = pd.DataFrame(
+    ...     {"animal": ["dog", "pig"], "age": [10, 20]}
+    ... )
     >>> df
       animal  age
     0    dog   10
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index d41a9c80a10ec..a907aa01d16af 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -169,7 +169,7 @@ def array(
     would no longer return a :class:`arrays.NumpyExtensionArray` backed by a
     NumPy array.
 
-    >>> pd.array(['a', 'b'], dtype=str)
+    >>> pd.array(["a", "b"], dtype=str)
     <NumpyExtensionArray>
     ['a', 'b']
     Length: 2, dtype: str32
 
@@ -178,7 +178,7 @@ def array(
     data. If you really need the new array to be backed by a NumPy array,
     specify that in the dtype.
 
-    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
+    >>> pd.array(["a", "b"], dtype=np.dtype("<U1"))
     <NumpyExtensionArray>
     ['a', 'b']
     Length: 2, dtype: str32
 
@@ -193,12 +193,12 @@ def array(
     rather than a ``NumpyExtensionArray``. This is for symmetry with the
     case of timezone-aware data, which NumPy does not natively support.
 
-    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
+    >>> pd.array(["2015", "2016"], dtype="datetime64[ns]")
     <DatetimeArray>
     ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
     Length: 2, dtype: datetime64[ns]
 
-    >>> pd.array(["1h", "2h"], dtype='timedelta64[ns]')
+    >>> pd.array(["1h", "2h"], dtype="timedelta64[ns]")
     <TimedeltaArray>
     ['0 days 01:00:00', '0 days 02:00:00']
     Length: 2, dtype: timedelta64[ns]
 
@@ -230,27 +230,35 @@ def array(
 
     >>> with pd.option_context("string_storage", "pyarrow"):
     ...     arr = pd.array(["a", None, "c"])
-    ...
     >>> arr
     <ArrowStringArray>
     ['a', <NA>, 'c']
     Length: 3, dtype: string
 
-    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
+    >>> pd.array(
+    ...     [
+    ...         pd.Period("2000", freq="D"),
+    ...         pd.Period("2000", freq="D"),
+    ...     ]
+    ... )
     <PeriodArray>
     ['2000-01-01', '2000-01-01']
     Length: 2, dtype: period[D]
 
     You can use the string alias for `dtype`
 
-    >>> pd.array(['a', 'b', 'a'], dtype='category')
+    >>> pd.array(["a", "b", "a"], dtype="category")
     ['a', 'b', 'a']
     Categories (2, object): ['a', 'b']
 
     Or specify the actual dtype
 
-    >>> pd.array(['a', 'b', 'a'],
-    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
+    >>> pd.array(
+    ...     ["a", "b", "a"],
+    ...     dtype=pd.CategoricalDtype(
+    ...         ["a", "b", "c"], ordered=True
+    ...     ),
+    ... )
     ['a', 'b', 'a']
     Categories (3, object): ['a' < 'b' < 'c']
 
@@ -439,7 +447,7 @@ def extract_array(
 
     Examples
     --------
-    >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
+    >>> extract_array(pd.Series(["a", "b", "c"], dtype="category"))
     ['a', 'b', 'c']
     Categories (3, object): ['a', 'b', 'c']
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
index 6b00a5284ec5b..fa1924a1b47aa 100644
--- a/pandas/core/dtypes/base.py
+++ b/pandas/core/dtypes/base.py
@@ -96,8 +96,7 @@ class property**.
     >>> from pandas.api.extensions import ExtensionArray
     >>> class ExtensionDtype:
     ...     def __from_arrow__(
-    ...         self,
-    ...         array: pyarrow.Array | pyarrow.ChunkedArray
+    ...         self, array: pyarrow.Array | pyarrow.ChunkedArray
    ...     ) -> ExtensionArray:
     ...         ...
 
@@ -489,7 +488,10 @@ def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDt
 
     Examples
     --------
-    >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype
+    >>> from pandas.api.extensions import (
+    ...     register_extension_dtype,
+    ...     ExtensionDtype,
+    ... )
     >>> @register_extension_dtype
     ... class MyExtensionDtype(ExtensionDtype):
     ...     name = "myextension"
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index d5144174d3c71..26ab214df2b56 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -588,7 +588,9 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan):
         # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type
         # "Type[Any]"; expected "Hashable"  [arg-type]
         dtype, fill_value = _maybe_promote_cached(
-            dtype, fill_value, type(fill_value)  # type: ignore[arg-type]
+            dtype,
+            fill_value,
+            type(fill_value),  # type: ignore[arg-type]
         )
     except TypeError:
         # if fill_value is not hashable (required for caching)
@@ -890,10 +892,10 @@ def infer_dtype_from_array(arr) -> tuple[DtypeObj, ArrayLike]:
 
     Examples
     --------
-    >>> np.asarray([1, '1'])
+    >>> np.asarray([1, "1"])
     array(['1', '1'], dtype='<U21')
 
-    >>> infer_dtype_from_array([1, '1'])
+    >>> infer_dtype_from_array([1, "1"])
     (dtype('O'), [1, '1'])
     """
     if isinstance(arr, np.ndarray):
@@ -1607,7 +1609,9 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n
 
     Also, if you try to coerce float values to integers, it raises:
 
-    >>> maybe_cast_to_integer_array([1, 2, 3.5], dtype=np.dtype("int64"))
+    >>> maybe_cast_to_integer_array(
+    ...     [1, 2, 3.5], dtype=np.dtype("int64")
+    ... )
     Traceback (most recent call last):
         ...
     ValueError: Trying to coerce float values to integers
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 2245359fd8eac..b2106990eeb04 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -312,9 +312,13 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool:
     False
     >>> is_datetime64tz_dtype([1, 2, 3])
     False
-    >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3]))  # tz-naive
+    >>> is_datetime64tz_dtype(
+    ...     pd.DatetimeIndex([1, 2, 3])
+    ... )  # tz-naive
     False
-    >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
+    >>> is_datetime64tz_dtype(
+    ...     pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")
+    ... )
     True
 
     >>> from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -367,7 +371,7 @@ def is_timedelta64_dtype(arr_or_dtype) -> bool:
     False
     >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]"))
     True
-    >>> is_timedelta64_dtype('0 days')
+    >>> is_timedelta64_dtype("0 days")
     False
     """
     if isinstance(arr_or_dtype, np.dtype):
@@ -544,7 +548,7 @@ def is_string_dtype(arr_or_dtype) -> bool:
     True
     >>> is_string_dtype(int)
     False
-    >>> is_string_dtype(np.array(['a', 'b']))
+    >>> is_string_dtype(np.array(["a", "b"]))
     True
     >>> is_string_dtype(pd.Series([1, 2]))
     False
@@ -646,9 +650,9 @@ def is_integer_dtype(arr_or_dtype) -> bool:
     False
     >>> is_integer_dtype(np.uint64)
     True
-    >>> is_integer_dtype('int8')
+    >>> is_integer_dtype("int8")
     True
-    >>> is_integer_dtype('Int8')
+    >>> is_integer_dtype("Int8")
     True
     >>> is_integer_dtype(pd.Int8Dtype)
     True
@@ -656,13 +660,13 @@ def is_integer_dtype(arr_or_dtype) -> bool:
     False
     >>> is_integer_dtype(np.timedelta64)
     False
-    >>> is_integer_dtype(np.array(['a', 'b']))
+    >>> is_integer_dtype(np.array(["a", "b"]))
     False
     >>> is_integer_dtype(pd.Series([1, 2]))
     True
     >>> is_integer_dtype(np.array([], dtype=np.timedelta64))
     False
-    >>> is_integer_dtype(pd.Index([1, 2.]))  # float
+    >>> is_integer_dtype(pd.Index([1, 2.0]))  # float
     False
     """
     return _is_dtype_type(
@@ -694,7 +698,9 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool:
 
     Examples
     --------
-    >>> from pandas.core.dtypes.common import is_signed_integer_dtype
+    >>> from pandas.core.dtypes.common import (
+    ...     is_signed_integer_dtype,
+    ...
) >>> is_signed_integer_dtype(str) False >>> is_signed_integer_dtype(int) @@ -703,9 +709,9 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: False >>> is_signed_integer_dtype(np.uint64) # unsigned False - >>> is_signed_integer_dtype('int8') + >>> is_signed_integer_dtype("int8") True - >>> is_signed_integer_dtype('Int8') + >>> is_signed_integer_dtype("Int8") True >>> is_signed_integer_dtype(pd.Int8Dtype) True @@ -713,15 +719,17 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: False >>> is_signed_integer_dtype(np.timedelta64) False - >>> is_signed_integer_dtype(np.array(['a', 'b'])) + >>> is_signed_integer_dtype(np.array(["a", "b"])) False >>> is_signed_integer_dtype(pd.Series([1, 2])) True >>> is_signed_integer_dtype(np.array([], dtype=np.timedelta64)) False - >>> is_signed_integer_dtype(pd.Index([1, 2.])) # float + >>> is_signed_integer_dtype(pd.Index([1, 2.0])) # float False - >>> is_signed_integer_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned + >>> is_signed_integer_dtype( + ... np.array([1, 2], dtype=np.uint32) + ... ) # unsigned False """ return _is_dtype_type( @@ -759,17 +767,17 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool: False >>> is_unsigned_integer_dtype(np.uint64) True - >>> is_unsigned_integer_dtype('uint8') + >>> is_unsigned_integer_dtype("uint8") True - >>> is_unsigned_integer_dtype('UInt8') + >>> is_unsigned_integer_dtype("UInt8") True >>> is_unsigned_integer_dtype(pd.UInt8Dtype) True - >>> is_unsigned_integer_dtype(np.array(['a', 'b'])) + >>> is_unsigned_integer_dtype(np.array(["a", "b"])) False >>> is_unsigned_integer_dtype(pd.Series([1, 2])) # signed False - >>> is_unsigned_integer_dtype(pd.Index([1, 2.])) # float + >>> is_unsigned_integer_dtype(pd.Index([1, 2.0])) # float False >>> is_unsigned_integer_dtype(np.array([1, 2], dtype=np.uint32)) True @@ -815,9 +823,9 @@ def is_int64_dtype(arr_or_dtype) -> bool: False >>> is_int64_dtype(np.int64) # doctest: +SKIP True - >>> is_int64_dtype('int8') # doctest: +SKIP + >>> is_int64_dtype("int8") # doctest: +SKIP False - >>> is_int64_dtype('Int8') # doctest: +SKIP + >>> is_int64_dtype("Int8") # doctest: +SKIP False >>> is_int64_dtype(pd.Int64Dtype) # doctest: +SKIP True @@ -825,13 +833,19 @@ def is_int64_dtype(arr_or_dtype) -> bool: False >>> is_int64_dtype(np.uint64) # unsigned # doctest: +SKIP False - >>> is_int64_dtype(np.array(['a', 'b'])) # doctest: +SKIP + >>> is_int64_dtype(np.array(["a", "b"])) # doctest: +SKIP False - >>> is_int64_dtype(np.array([1, 2], dtype=np.int64)) # doctest: +SKIP + >>> is_int64_dtype( + ... np.array([1, 2], dtype=np.int64) + ... ) # doctest: +SKIP True - >>> is_int64_dtype(pd.Index([1, 2.])) # float # doctest: +SKIP + >>> is_int64_dtype( + ... pd.Index([1, 2.0]) + ... ) # float # doctest: +SKIP False - >>> is_int64_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned # doctest: +SKIP + >>> is_int64_dtype( + ... np.array([1, 2], dtype=np.uint32) + ... ) # unsigned # doctest: +SKIP False """ # GH#52564 @@ -870,13 +884,17 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: True >>> is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern")) True - >>> is_datetime64_any_dtype(np.array(['a', 'b'])) + >>> is_datetime64_any_dtype(np.array(["a", "b"])) False >>> is_datetime64_any_dtype(np.array([1, 2])) False - >>> is_datetime64_any_dtype(np.array([], dtype="datetime64[ns]")) + >>> is_datetime64_any_dtype( + ... np.array([], dtype="datetime64[ns]") + ... ) True - >>> is_datetime64_any_dtype(pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]")) + >>> is_datetime64_any_dtype( + ... 
pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]") + ... ) True """ if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): @@ -919,15 +937,21 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: False >>> is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern")) True - >>> is_datetime64_ns_dtype(np.array(['a', 'b'])) + >>> is_datetime64_ns_dtype(np.array(["a", "b"])) False >>> is_datetime64_ns_dtype(np.array([1, 2])) False - >>> is_datetime64_ns_dtype(np.array([], dtype="datetime64")) # no unit + >>> is_datetime64_ns_dtype( + ... np.array([], dtype="datetime64") + ... ) # no unit False - >>> is_datetime64_ns_dtype(np.array([], dtype="datetime64[ps]")) # wrong unit + >>> is_datetime64_ns_dtype( + ... np.array([], dtype="datetime64[ps]") + ... ) # wrong unit False - >>> is_datetime64_ns_dtype(pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]")) + >>> is_datetime64_ns_dtype( + ... pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]") + ... ) True """ if arr_or_dtype is None: @@ -960,14 +984,20 @@ def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: Examples -------- - >>> from pandas.core.dtypes.common import is_timedelta64_ns_dtype - >>> is_timedelta64_ns_dtype(np.dtype('m8[ns]')) + >>> from pandas.core.dtypes.common import ( + ... is_timedelta64_ns_dtype, + ... ) + >>> is_timedelta64_ns_dtype(np.dtype("m8[ns]")) True - >>> is_timedelta64_ns_dtype(np.dtype('m8[ps]')) # Wrong frequency + >>> is_timedelta64_ns_dtype( + ... np.dtype("m8[ps]") + ... ) # Wrong frequency False - >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]')) + >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype="m8[ns]")) True - >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) + >>> is_timedelta64_ns_dtype( + ... np.array([1, 2], dtype=np.timedelta64) + ... ) False """ return _is_dtype(arr_or_dtype, lambda dtype: dtype == TD64NS_DTYPE) @@ -996,13 +1026,19 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: -------- >>> is_numeric_v_string_like(np.array([1]), "foo") True - >>> is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"])) + >>> is_numeric_v_string_like( + ... np.array([1, 2]), np.array(["foo"]) + ... ) True - >>> is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2])) + >>> is_numeric_v_string_like( + ... np.array(["foo"]), np.array([1, 2]) + ... ) True >>> is_numeric_v_string_like(np.array([1]), np.array([2])) False - >>> is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"])) + >>> is_numeric_v_string_like( + ... np.array(["foo"]), np.array(["foo"]) + ... ) False """ is_a_array = isinstance(a, np.ndarray) @@ -1047,15 +1083,19 @@ def needs_i8_conversion(dtype: DtypeObj | None) -> bool: False >>> needs_i8_conversion(np.dtype(np.datetime64)) True - >>> needs_i8_conversion(np.array(['a', 'b'])) + >>> needs_i8_conversion(np.array(["a", "b"])) False >>> needs_i8_conversion(pd.Series([1, 2])) False >>> needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]")) False - >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) + >>> needs_i8_conversion( + ... pd.DatetimeIndex([1, 2, 3], tz="US/Eastern") + ... ) False - >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern").dtype) + >>> needs_i8_conversion( + ... pd.DatetimeIndex([1, 2, 3], tz="US/Eastern").dtype + ... 
) True """ if isinstance(dtype, np.dtype): @@ -1092,11 +1132,11 @@ def is_numeric_dtype(arr_or_dtype) -> bool: False >>> is_numeric_dtype(np.timedelta64) False - >>> is_numeric_dtype(np.array(['a', 'b'])) + >>> is_numeric_dtype(np.array(["a", "b"])) False >>> is_numeric_dtype(pd.Series([1, 2])) True - >>> is_numeric_dtype(pd.Index([1, 2.])) + >>> is_numeric_dtype(pd.Index([1, 2.0])) True >>> is_numeric_dtype(np.array([], dtype=np.timedelta64)) False @@ -1168,11 +1208,11 @@ def is_float_dtype(arr_or_dtype) -> bool: False >>> is_float_dtype(float) True - >>> is_float_dtype(np.array(['a', 'b'])) + >>> is_float_dtype(np.array(["a", "b"])) False >>> is_float_dtype(pd.Series([1, 2])) False - >>> is_float_dtype(pd.Index([1, 2.])) + >>> is_float_dtype(pd.Index([1, 2.0])) True """ return _is_dtype_type(arr_or_dtype, classes(np.floating)) or _is_dtype( @@ -1210,7 +1250,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: True >>> is_bool_dtype(np.bool_) True - >>> is_bool_dtype(np.array(['a', 'b'])) + >>> is_bool_dtype(np.array(["a", "b"])) False >>> is_bool_dtype(pd.Series([1, 2])) False @@ -1294,13 +1334,13 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: Examples -------- >>> from pandas.api.types import is_extension_array_dtype - >>> arr = pd.Categorical(['a', 'b']) + >>> arr = pd.Categorical(["a", "b"]) >>> is_extension_array_dtype(arr) True >>> is_extension_array_dtype(arr.dtype) True - >>> arr = np.array(['a', 'b']) + >>> arr = np.array(["a", "b"]) >>> is_extension_array_dtype(arr.dtype) False """ @@ -1347,7 +1387,7 @@ def is_complex_dtype(arr_or_dtype) -> bool: False >>> is_complex_dtype(np.complex128) True - >>> is_complex_dtype(np.array(['a', 'b'])) + >>> is_complex_dtype(np.array(["a", "b"])) False >>> is_complex_dtype(pd.Series([1, 2])) False diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9ec662a6cd352..4b6587d37e8a4 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -242,7 +242,9 @@ def union_categoricals( in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. - >>> pd.api.types.union_categoricals([a, b], sort_categories=True) + >>> pd.api.types.union_categoricals( + ... [a, b], sort_categories=True + ... ) ['b', 'c', 'a', 'b'] Categories (3, object): ['a', 'b', 'c'] @@ -278,8 +280,8 @@ def union_categoricals( containing categorical data, but note that the resulting array will always be a plain `Categorical` - >>> a = pd.Series(["b", "c"], dtype='category') - >>> b = pd.Series(["a", "b"], dtype='category') + >>> a = pd.Series(["b", "c"], dtype="category") + >>> b = pd.Series(["a", "b"], dtype="category") >>> pd.api.types.union_categoricals([a, b]) ['b', 'c', 'a', 'b'] Categories (3, object): ['b', 'c', 'a'] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 4c1654ab0f5e4..0af12bcc55bb0 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -190,8 +190,8 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): Examples -------- - >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) - >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + >>> t = pd.CategoricalDtype(categories=["b", "a"], ordered=True) + >>> pd.Series(["a", "b", "a", "c"], dtype=t) 0 a 1 b 2 a @@ -288,14 +288,14 @@ def _from_values_or_dtype( >>> pd.CategoricalDtype._from_values_or_dtype() CategoricalDtype(categories=None, ordered=None, categories_dtype=None) >>> pd.CategoricalDtype._from_values_or_dtype( - ... 
categories=['a', 'b'], ordered=True + ... categories=["a", "b"], ordered=True ... ) CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object) - >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True) - >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False) + >>> dtype1 = pd.CategoricalDtype(["a", "b"], ordered=True) + >>> dtype2 = pd.CategoricalDtype(["x", "y"], ordered=False) >>> c = pd.Categorical([0, 1], dtype=dtype1) >>> pd.CategoricalDtype._from_values_or_dtype( - ... c, ['x', 'y'], ordered=True, dtype=dtype2 + ... c, ["x", "y"], ordered=True, dtype=dtype2 ... ) Traceback (most recent call last): ... @@ -623,7 +623,9 @@ def categories(self) -> Index: Examples -------- - >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> cat_type = pd.CategoricalDtype( + ... categories=["a", "b"], ordered=True + ... ) >>> cat_type.categories Index(['a', 'b'], dtype='object') """ @@ -636,11 +638,15 @@ def ordered(self) -> Ordered: Examples -------- - >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> cat_type = pd.CategoricalDtype( + ... categories=["a", "b"], ordered=True + ... ) >>> cat_type.ordered True - >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=False) + >>> cat_type = pd.CategoricalDtype( + ... categories=["a", "b"], ordered=False + ... ) >>> cat_type.ordered False """ @@ -719,10 +725,10 @@ class DatetimeTZDtype(PandasExtensionDtype): Examples -------- >>> from zoneinfo import ZoneInfo - >>> pd.DatetimeTZDtype(tz=ZoneInfo('UTC')) + >>> pd.DatetimeTZDtype(tz=ZoneInfo("UTC")) datetime64[ns, UTC] - >>> pd.DatetimeTZDtype(tz=ZoneInfo('Europe/Paris')) + >>> pd.DatetimeTZDtype(tz=ZoneInfo("Europe/Paris")) datetime64[ns, Europe/Paris] """ @@ -795,7 +801,9 @@ def unit(self) -> str_type: Examples -------- >>> from zoneinfo import ZoneInfo - >>> dtype = pd.DatetimeTZDtype(tz=ZoneInfo('America/Los_Angeles')) + >>> dtype = pd.DatetimeTZDtype( + ... tz=ZoneInfo("America/Los_Angeles") + ... ) >>> dtype.unit 'ns' """ @@ -809,7 +817,9 @@ def tz(self) -> tzinfo: Examples -------- >>> from zoneinfo import ZoneInfo - >>> dtype = pd.DatetimeTZDtype(tz=ZoneInfo('America/Los_Angeles')) + >>> dtype = pd.DatetimeTZDtype( + ... tz=ZoneInfo("America/Los_Angeles") + ... 
)
        >>> dtype.tz
        zoneinfo.ZoneInfo(key='America/Los_Angeles')
        """
@@ -842,7 +852,7 @@ def construct_from_string(cls, string: str_type) -> DatetimeTZDtype:
 
         Examples
         --------
-        >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]')
+        >>> DatetimeTZDtype.construct_from_string("datetime64[ns, UTC]")
         datetime64[ns, UTC]
         """
         if not isinstance(string, str):
@@ -964,7 +974,7 @@ class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype):
 
     Examples
     --------
-    >>> pd.PeriodDtype(freq='D')
+    >>> pd.PeriodDtype(freq="D")
     period[D]
 
     >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd())
@@ -1028,7 +1038,7 @@ def freq(self) -> BaseOffset:
 
         Examples
         --------
-        >>> dtype = pd.PeriodDtype(freq='D')
+        >>> dtype = pd.PeriodDtype(freq="D")
         >>> dtype.freq
         <Day>
         """
@@ -1183,7 +1193,7 @@ class IntervalDtype(PandasExtensionDtype):
 
     Examples
     --------
-    >>> pd.IntervalDtype(subtype='int64', closed='both')
+    >>> pd.IntervalDtype(subtype="int64", closed="both")
     interval[int64, both]
     """
@@ -1283,7 +1293,7 @@ def subtype(self):
 
         Examples
         --------
-        >>> dtype = pd.IntervalDtype(subtype='int64', closed='both')
+        >>> dtype = pd.IntervalDtype(subtype="int64", closed="both")
         >>> dtype.subtype
         dtype('int64')
         """
@@ -1644,7 +1654,9 @@ class SparseDtype(ExtensionDtype):
 
     Examples
     --------
-    >>> ser = pd.Series([1, 0, 0], dtype=pd.SparseDtype(dtype=int, fill_value=0))
+    >>> ser = pd.Series(
+    ...     [1, 0, 0], dtype=pd.SparseDtype(dtype=int, fill_value=0)
+    ... )
     >>> ser
     0    1
     1    0
@@ -2001,7 +2013,7 @@ def _subtype_with_str(self):
         >>> SparseDtype(object, 1)._subtype_with_str
         dtype('O')
 
-        >>> dtype = SparseDtype(str, '')
+        >>> dtype = SparseDtype(str, "")
         >>> dtype.subtype
         dtype('O')
diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py
index f551716772f61..dd5d3c3c442b6 100644
--- a/pandas/core/dtypes/inference.py
+++ b/pandas/core/dtypes/inference.py
@@ -425,7 +425,7 @@ def is_dataclass(item) -> bool:
     >>> is_dataclass(Point)
     False
 
-    >>> is_dataclass(Point(0,2))
+    >>> is_dataclass(Point(0, 2))
     True
 
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 4dc0d477f89e8..eb18c9934b89c 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -129,7 +129,7 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
     --------
     Scalar arguments (including strings) result in a scalar boolean.
 
-    >>> pd.isna('dog')
+    >>> pd.isna("dog")
     False
 
     >>> pd.isna(pd.NA)
@@ -150,8 +150,9 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
 
     For indexes, an ndarray of booleans is returned.
 
-    >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
-    ...                           "2017-07-08"])
+    >>> index = pd.DatetimeIndex(
+    ...     ["2017-07-05", "2017-07-06", None, "2017-07-08"]
+    ... )
     >>> index
     DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
                   dtype='datetime64[ns]', freq=None)
@@ -160,7 +161,9 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
 
     For Series and DataFrame, the same type is returned, containing booleans.
 
-    >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
+    >>> df = pd.DataFrame(
+    ...     [["ant", "bee", "cat"], ["dog", None, "fly"]]
+    ... )
     >>> df
          0     1    2
     0  ant   bee  cat
@@ -408,7 +411,7 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
     --------
     Scalar arguments (including strings) result in a scalar boolean.
 
-    >>> pd.notna('dog')
+    >>> pd.notna("dog")
     True
 
     >>> pd.notna(pd.NA)
@@ -429,8 +432,9 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
 
     For indexes, an ndarray of booleans is returned.
 
-    >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
-    ...                           "2017-07-08"])
+    >>> index = pd.DatetimeIndex(
+    ...     ["2017-07-05", "2017-07-06", None, "2017-07-08"]
+    ... )
     >>> index
     DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
                   dtype='datetime64[ns]', freq=None)
@@ -439,7 +443,9 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
 
     For Series and DataFrame, the same type is returned, containing booleans.
 
-    >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
+    >>> df = pd.DataFrame(
+    ...     [["ant", "bee", "cat"], ["dog", None, "fly"]]
+    ... )
     >>> df
          0     1    2
     0  ant   bee  cat
@@ -496,12 +502,12 @@ def array_equivalent(
     Examples
     --------
     >>> array_equivalent(
-    ...     np.array([1, 2, np.nan]),
-    ...     np.array([1, 2, np.nan]))
+    ...     np.array([1, 2, np.nan]), np.array([1, 2, np.nan])
+    ... )
     True
     >>> array_equivalent(
-    ...     np.array([1, np.nan, 2]),
-    ...     np.array([1, 2, np.nan]))
+    ...     np.array([1, np.nan, 2]), np.array([1, 2, np.nan])
+    ... )
     False
     """
     left, right = np.asarray(left), np.asarray(right)
@@ -671,15 +677,15 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
 
     Examples
     --------
-    >>> na_value_for_dtype(np.dtype('int64'))
+    >>> na_value_for_dtype(np.dtype("int64"))
     0
-    >>> na_value_for_dtype(np.dtype('int64'), compat=False)
+    >>> na_value_for_dtype(np.dtype("int64"), compat=False)
     nan
-    >>> na_value_for_dtype(np.dtype('float64'))
+    >>> na_value_for_dtype(np.dtype("float64"))
     nan
-    >>> na_value_for_dtype(np.dtype('bool'))
+    >>> na_value_for_dtype(np.dtype("bool"))
     False
-    >>> na_value_for_dtype(np.dtype('datetime64[ns]'))
+    >>> na_value_for_dtype(np.dtype("datetime64[ns]"))
     numpy.datetime64('NaT')
     """
diff --git a/pandas/core/flags.py b/pandas/core/flags.py
index aff7a15f283ba..0a8a35d7aa729 100644
--- a/pandas/core/flags.py
+++ b/pandas/core/flags.py
@@ -41,7 +41,7 @@ class Flags:
     >>> df.flags
     <Flags(allows_duplicate_labels=True)>
 
-    >>> df.flags['allows_duplicate_labels'] = True
+    >>> df.flags["allows_duplicate_labels"] = True
     >>> df.flags
     <Flags(allows_duplicate_labels=True)>
     """
@@ -71,7 +71,7 @@ def allows_duplicate_labels(self) -> bool:
 
         Examples
         --------
-        >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a'])
+        >>> df = pd.DataFrame({"A": [1, 2]}, index=["a", "a"])
         >>> df.flags.allows_duplicate_labels
         True
         >>> df.flags.allows_duplicate_labels = False
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9a8387ce02dfb..67f8494a035c0 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -560,7 +560,7 @@ class DataFrame(NDFrame, OpsMixin):
     --------
     Constructing DataFrame from a dictionary.
 
-    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
+    >>> d = {"col1": [1, 2], "col2": [3, 4]}
     >>> df = pd.DataFrame(data=d)
     >>> df
        col1  col2
     0     1     3
@@ -584,7 +584,10 @@ class DataFrame(NDFrame, OpsMixin):
 
     Constructing DataFrame from a dictionary including Series:
 
-    >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
+    >>> d = {
+    ...     "col1": [0, 1, 2, 3],
+    ...     "col2": pd.Series([2, 3], index=[2, 3]),
+    ... }
     >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
        col1  col2
     0     0   NaN
@@ -594,8 +597,10 @@ class DataFrame(NDFrame, OpsMixin):
 
     Constructing DataFrame from numpy ndarray:
 
-    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
-    ...                    columns=['a', 'b', 'c'])
+    >>> df2 = pd.DataFrame(
+    ...     np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
+    ...     columns=["a", "b", "c"],
+    ...
) >>> df2 a b c 0 1 2 3 @@ -604,10 +609,11 @@ class DataFrame(NDFrame, OpsMixin): Constructing DataFrame from a numpy ndarray that has labeled columns: - >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], - ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) - >>> df3 = pd.DataFrame(data, columns=['c', 'a']) - ... + >>> data = np.array( + ... [(1, 2, 3), (4, 5, 6), (7, 8, 9)], + ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")], + ... ) + >>> df3 = pd.DataFrame(data, columns=["c", "a"]) >>> df3 c a 0 3 1 @@ -633,7 +639,9 @@ class DataFrame(NDFrame, OpsMixin): a 1 c 3 - >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"]) + >>> df1 = pd.DataFrame( + ... [1, 2, 3], index=["a", "b", "c"], columns=["x"] + ... ) >>> df2 = pd.DataFrame(data=df1, index=["a", "c"]) >>> df2 x @@ -937,12 +945,17 @@ def __dataframe__( Examples -------- - >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - >>> interchange_object = df_not_necessarily_pandas.__dataframe__() + >>> df_not_necessarily_pandas = pd.DataFrame( + ... {"A": [1, 2], "B": [3, 4]} + ... ) + >>> interchange_object = ( + ... df_not_necessarily_pandas.__dataframe__() + ... ) >>> interchange_object.column_names() Index(['A', 'B'], dtype='object') - >>> df_pandas = (pd.api.interchange.from_dataframe - ... (interchange_object.select_columns_by_name(['A']))) + >>> df_pandas = pd.api.interchange.from_dataframe( + ... interchange_object.select_columns_by_name(["A"]) + ... ) >>> df_pandas A 0 1 @@ -983,7 +996,7 @@ def axes(self) -> list[Index]: Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.axes [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], dtype='object')] @@ -1001,12 +1014,13 @@ def shape(self) -> tuple[int, int]: Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.shape (2, 2) - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], - ... 'col3': [5, 6]}) + >>> df = pd.DataFrame( + ... {"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]} + ... ) >>> df.shape (2, 3) """ @@ -1025,15 +1039,20 @@ def _is_homogeneous_type(self) -> bool: -------- >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type True - >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type + >>> DataFrame( + ... {"A": [1, 2], "B": [3.0, 4.0]} + ... )._is_homogeneous_type False Items with the same type but different sizes are considered different types. - >>> DataFrame({ - ... "A": np.array([1, 2], dtype=np.int32), - ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type + >>> DataFrame( + ... { + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64), + ... } + ... )._is_homogeneous_type False """ # The "<" part of "<=" here is for empty DataFrame cases @@ -1308,7 +1327,7 @@ def to_string( Examples -------- - >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} + >>> d = {"col1": [1, 2, 3], "col2": [4, 5, 6]} >>> df = pd.DataFrame(d) >>> print(df.to_string()) col1 col2 @@ -1378,7 +1397,7 @@ def style(self) -> Styler: Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3]}) + >>> df = pd.DataFrame({"A": [1, 2, 3]}) >>> df.style # doctest: +SKIP Please see @@ -1388,9 +1407,7 @@ def style(self) -> Styler: return Styler(self) - _shared_docs[ - "items" - ] = r""" + _shared_docs["items"] = r""" Iterate over (column name, Series) pairs. 
Iterates over the DataFrame columns, returning a tuple with @@ -1481,15 +1498,15 @@ def iterrows(self) -> Iterable[tuple[Hashable, Series]]: Examples -------- - >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + >>> df = pd.DataFrame([[1, 1.5]], columns=["int", "float"]) >>> row = next(df.iterrows())[1] >>> row int 1.0 float 1.5 Name: 0, dtype: float64 - >>> print(row['int'].dtype) + >>> print(row["int"].dtype) float64 - >>> print(df['int'].dtype) + >>> print(df["int"].dtype) int64 """ columns = self.columns @@ -1535,15 +1552,16 @@ def itertuples( Examples -------- - >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, - ... index=['dog', 'hawk']) + >>> df = pd.DataFrame( + ... {"num_legs": [4, 2], "num_wings": [0, 2]}, + ... index=["dog", "hawk"], + ... ) >>> df num_legs num_wings dog 4 0 hawk 2 2 >>> for row in df.itertuples(): ... print(row) - ... Pandas(Index='dog', num_legs=4, num_wings=0) Pandas(Index='hawk', num_legs=2, num_wings=2) @@ -1552,16 +1570,14 @@ def itertuples( >>> for row in df.itertuples(index=False): ... print(row) - ... Pandas(num_legs=4, num_wings=0) Pandas(num_legs=2, num_wings=2) With the `name` parameter set we set a custom name for the yielded namedtuples: - >>> for row in df.itertuples(name='Animal'): + >>> for row in df.itertuples(name="Animal"): ... print(row) - ... Animal(Index='dog', num_legs=4, num_wings=0) Animal(Index='hawk', num_legs=2, num_wings=2) """ @@ -1796,7 +1812,10 @@ def from_dict( -------- By default the keys of the dict become the DataFrame columns: - >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> data = { + ... "col_1": [3, 2, 1, 0], + ... "col_2": ["a", "b", "c", "d"], + ... } >>> pd.DataFrame.from_dict(data) col_1 col_2 0 3 a @@ -1807,8 +1826,11 @@ def from_dict( Specify ``orient='index'`` to create the DataFrame using dictionary keys as rows: - >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} - >>> pd.DataFrame.from_dict(data, orient='index') + >>> data = { + ... "row_1": [3, 2, 1, 0], + ... "row_2": ["a", "b", "c", "d"], + ... } + >>> pd.DataFrame.from_dict(data, orient="index") 0 1 2 3 row_1 3 2 1 0 row_2 a b c d @@ -1816,8 +1838,9 @@ def from_dict( When using the 'index' orientation, the column names can be specified manually: - >>> pd.DataFrame.from_dict(data, orient='index', - ... columns=['A', 'B', 'C', 'D']) + >>> pd.DataFrame.from_dict( + ... data, orient="index", columns=["A", "B", "C", "D"] + ... ) A B C D row_1 3 2 1 0 row_2 a b c d @@ -1825,12 +1848,14 @@ def from_dict( Specify ``orient='tight'`` to create the DataFrame using a 'tight' format: - >>> data = {'index': [('a', 'b'), ('a', 'c')], - ... 'columns': [('x', 1), ('y', 2)], - ... 'data': [[1, 3], [2, 4]], - ... 'index_names': ['n1', 'n2'], - ... 'column_names': ['z1', 'z2']} - >>> pd.DataFrame.from_dict(data, orient='tight') + >>> data = { + ... "index": [("a", "b"), ("a", "c")], + ... "columns": [("x", 1), ("y", 2)], + ... "data": [[1, 3], [2, 4]], + ... "index_names": ["n1", "n2"], + ... "column_names": ["z1", "z2"], + ... } + >>> pd.DataFrame.from_dict(data, orient="tight") z1 x y z2 1 2 n1 n2 @@ -1928,7 +1953,7 @@ def to_numpy( For a mix of numeric and non-numeric types, the output array will have object dtype. 
- >>> df['C'] = pd.date_range('2000', periods=2) + >>> df["C"] = pd.date_range("2000", periods=2) >>> df.to_numpy() array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) @@ -2013,8 +2038,7 @@ def to_dict( orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - into: type[MutableMappingT] - | MutableMappingT = dict, # type: ignore[assignment] + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, ) -> MutableMappingT | list[MutableMappingT]: """ @@ -2070,9 +2094,10 @@ def to_dict( Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], - ... 'col2': [0.5, 0.75]}, - ... index=['row1', 'row2']) + >>> df = pd.DataFrame( + ... {"col1": [1, 2], "col2": [0.5, 0.75]}, + ... index=["row1", "row2"], + ... ) >>> df col1 col2 row1 1 0.50 @@ -2082,7 +2107,7 @@ def to_dict( You can specify the return orientation. - >>> df.to_dict('series') + >>> df.to_dict("series") {'col1': row1 1 row2 2 Name: col1, dtype: int64, @@ -2090,17 +2115,17 @@ def to_dict( row2 0.75 Name: col2, dtype: float64} - >>> df.to_dict('split') + >>> df.to_dict("split") {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]} - >>> df.to_dict('records') + >>> df.to_dict("records") [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] - >>> df.to_dict('index') + >>> df.to_dict("index") {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} - >>> df.to_dict('tight') + >>> df.to_dict("tight") {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} @@ -2114,7 +2139,7 @@ def to_dict( If you want a `defaultdict`, you need to initialize it: >>> dd = defaultdict(list) - >>> df.to_dict('records', into=dd) + >>> df.to_dict("records", into=dd) [defaultdict(, {'col1': 1, 'col2': 0.5}), defaultdict(, {'col1': 2, 'col2': 0.75})] """ @@ -2231,16 +2256,17 @@ def to_gbq( `_ >>> project_id = "my-project" - >>> table_id = 'my_dataset.my_table' - >>> df = pd.DataFrame({ - ... "my_string": ["a", "b", "c"], - ... "my_int64": [1, 2, 3], - ... "my_float64": [4.0, 5.0, 6.0], - ... "my_bool1": [True, False, True], - ... "my_bool2": [False, True, False], - ... "my_dates": pd.date_range("now", periods=3), - ... } - ... ) + >>> table_id = "my_dataset.my_table" + >>> df = pd.DataFrame( + ... { + ... "my_string": ["a", "b", "c"], + ... "my_int64": [1, 2, 3], + ... "my_float64": [4.0, 5.0, 6.0], + ... "my_bool1": [True, False, True], + ... "my_bool2": [False, True, False], + ... "my_dates": pd.date_range("now", periods=3), + ... } + ... ) >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP """ @@ -2313,8 +2339,10 @@ def from_records( -------- Data can be provided as a structured ndarray: - >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], - ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) + >>> data = np.array( + ... [(3, "a"), (2, "b"), (1, "c"), (0, "d")], + ... dtype=[("col_1", "i4"), ("col_2", "U1")], + ... ) >>> pd.DataFrame.from_records(data) col_1 col_2 0 3 a @@ -2324,10 +2352,12 @@ def from_records( Data can be provided as a list of dicts: - >>> data = [{'col_1': 3, 'col_2': 'a'}, - ... {'col_1': 2, 'col_2': 'b'}, - ... {'col_1': 1, 'col_2': 'c'}, - ... {'col_1': 0, 'col_2': 'd'}] + >>> data = [ + ... {"col_1": 3, "col_2": "a"}, + ... {"col_1": 2, "col_2": "b"}, + ... {"col_1": 1, "col_2": "c"}, + ... {"col_1": 0, "col_2": "d"}, + ... 
] >>> pd.DataFrame.from_records(data) col_1 col_2 0 3 a @@ -2337,8 +2367,8 @@ def from_records( Data can be provided as a list of tuples with corresponding columns: - >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] - >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) + >>> data = [(3, "a"), (2, "b"), (1, "c"), (0, "d")] + >>> pd.DataFrame.from_records(data, columns=["col_1", "col_2"]) col_1 col_2 0 3 a 1 2 b @@ -2529,8 +2559,9 @@ def to_records( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, - ... index=['a', 'b']) + >>> df = pd.DataFrame( + ... {"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"] + ... ) >>> df A B a 1 0.50 @@ -2803,10 +2834,20 @@ def to_stata( Examples -------- - >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', - ... 'parrot'], - ... 'speed': [350, 18, 361, 15]}}) - >>> df.to_stata('animals.dta') # doctest: +SKIP + >>> df = pd.DataFrame( + ... { + ... { + ... "animal": [ + ... "falcon", + ... "parrot", + ... "falcon", + ... "parrot", + ... ], + ... "speed": [350, 18, 361, 15], + ... } + ... } + ... ) + >>> df.to_stata("animals.dta") # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): raise ValueError("Only formats 114, 117, 118 and 119 are supported.") @@ -3035,10 +3076,11 @@ def to_parquet( Examples -------- - >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) - >>> df.to_parquet('df.parquet.gzip', - ... compression='gzip') # doctest: +SKIP - >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP + >>> df = pd.DataFrame(data={{"col1": [1, 2], "col2": [3, 4]}}) + >>> df.to_parquet( + ... "df.parquet.gzip", compression="gzip" + ... ) # doctest: +SKIP + >>> pd.read_parquet("df.parquet.gzip") # doctest: +SKIP col1 col2 0 1 3 1 2 4 @@ -3133,9 +3175,9 @@ def to_orc( Examples -------- - >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) - >>> df.to_orc('df.orc') # doctest: +SKIP - >>> pd.read_orc('df.orc') # doctest: +SKIP + >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}) + >>> df.to_orc("df.orc") # doctest: +SKIP + >>> pd.read_orc("df.orc") # doctest: +SKIP col1 col2 0 1 4 1 2 3 @@ -3276,7 +3318,7 @@ def to_html( Examples -------- - >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) + >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}) >>> html_string = ''' ... ... @@ -3481,9 +3523,15 @@ def to_xml( Examples -------- - >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'], - ... 'degrees': [360, 360, 180], - ... 'sides': [4, np.nan, 3]}}) + >>> df = pd.DataFrame( + ... { + ... { + ... "shape": ["square", "circle", "triangle"], + ... "degrees": [360, 360, 180], + ... "sides": [4, np.nan, 3], + ... } + ... } + ... ) >>> df.to_xml() # doctest: +SKIP @@ -3508,9 +3556,9 @@ def to_xml( - >>> df.to_xml(attr_cols=[ - ... 'index', 'shape', 'degrees', 'sides' - ... ]) # doctest: +SKIP + >>> df.to_xml( + ... attr_cols=["index", "shape", "degrees", "sides"] + ... ) # doctest: +SKIP @@ -3518,8 +3566,10 @@ def to_xml( - >>> df.to_xml(namespaces={{"doc": "https://example.com"}}, - ... prefix="doc") # doctest: +SKIP + >>> df.to_xml( + ... namespaces={{"doc": "https://example.com"}}, + ... prefix="doc", + ... ) # doctest: +SKIP @@ -3651,9 +3701,19 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: Examples -------- - >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] - >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) - ... for t in dtypes]) + >>> dtypes = [ + ... "int64", + ... "float64", + ... 
"complex128", + ... "object", + ... "bool", + ... ] + >>> data = dict( + ... [ + ... (t, np.ones(shape=5000, dtype=int).astype(t)) + ... for t in dtypes + ... ] + ... ) >>> df = pd.DataFrame(data) >>> df.head() int64 float64 complex128 object bool @@ -3694,7 +3754,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: Use a Categorical for efficient storage of an object-dtype column with many repeated values. - >>> df['object'].astype('category').memory_usage(deep=True) + >>> df["object"].astype("category").memory_usage(deep=True) 5244 """ result = self._constructor_sliced( @@ -3759,7 +3819,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: -------- **Square DataFrame with homogeneous dtype** - >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} + >>> d1 = {"col1": [1, 2], "col2": [3, 4]} >>> df1 = pd.DataFrame(data=d1) >>> df1 col1 col2 @@ -3786,10 +3846,12 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: **Non-square DataFrame with mixed dtypes** - >>> d2 = {'name': ['Alice', 'Bob'], - ... 'score': [9.5, 8], - ... 'employed': [False, True], - ... 'kids': [0, 0]} + >>> d2 = { + ... "name": ["Alice", "Bob"], + ... "score": [9.5, 8], + ... "employed": [False, True], + ... "kids": [0, 0], + ... } >>> df2 = pd.DataFrame(data=d2) >>> df2 name score employed kids @@ -3909,7 +3971,7 @@ def T(self) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df col1 col2 0 1 3 @@ -4725,9 +4787,13 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No Examples -------- - >>> df = pd.DataFrame({'A': range(1, 6), - ... 'B': range(10, 0, -2), - ... 'C C': range(10, 5, -1)}) + >>> df = pd.DataFrame( + ... { + ... "A": range(1, 6), + ... "B": range(10, 0, -2), + ... "C C": range(10, 5, -1), + ... } + ... ) >>> df A B C C 0 1 10 10 @@ -4735,7 +4801,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No 2 3 6 8 3 4 4 7 4 5 2 6 - >>> df.query('A > B') + >>> df.query("A > B") A B C C 4 5 2 6 @@ -4747,13 +4813,13 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No For columns with spaces in their name, you can use backtick quoting. - >>> df.query('B == `C C`') + >>> df.query("B == `C C`") A B C C 0 1 10 10 The previous expression is equivalent to - >>> df[df.B == df['C C']] + >>> df[df.B == df["C C"]] A B C C 0 1 10 10 """ @@ -4829,7 +4895,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Examples -------- - >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)}) >>> df A B 0 1 10 @@ -4837,7 +4903,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: 2 3 6 3 4 4 4 5 2 - >>> df.eval('A + B') + >>> df.eval("A + B") 0 11 1 10 2 9 @@ -4848,7 +4914,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Assignment is allowed though by default the original DataFrame is not modified. - >>> df.eval('C = A + B') + >>> df.eval("C = A + B") A B C 0 1 10 11 1 2 8 10 @@ -4935,9 +5001,13 @@ def select_dtypes(self, include=None, exclude=None) -> Self: Examples -------- - >>> df = pd.DataFrame({'a': [1, 2] * 3, - ... 'b': [True, False] * 3, - ... 'c': [1.0, 2.0] * 3}) + >>> df = pd.DataFrame( + ... { + ... "a": [1, 2] * 3, + ... "b": [True, False] * 3, + ... "c": [1.0, 2.0] * 3, + ... } + ... 
) >>> df a b c 0 1 True 1.0 @@ -4947,7 +5017,7 @@ def select_dtypes(self, include=None, exclude=None) -> Self: 4 1 True 1.0 5 2 False 2.0 - >>> df.select_dtypes(include='bool') + >>> df.select_dtypes(include="bool") b 0 True 1 False @@ -4956,7 +5026,7 @@ def select_dtypes(self, include=None, exclude=None) -> Self: 4 True 5 False - >>> df.select_dtypes(include=['float64']) + >>> df.select_dtypes(include=["float64"]) c 0 1.0 1 2.0 @@ -4965,7 +5035,7 @@ def select_dtypes(self, include=None, exclude=None) -> Self: 4 1.0 5 2.0 - >>> df.select_dtypes(exclude=['int64']) + >>> df.select_dtypes(exclude=["int64"]) b c 0 True 1.0 1 False 2.0 @@ -5064,7 +5134,7 @@ def insert( Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df col1 col2 0 1 3 @@ -5144,8 +5214,9 @@ def assign(self, **kwargs) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, - ... index=['Portland', 'Berkeley']) + >>> df = pd.DataFrame( + ... {"temp_c": [17.0, 25.0]}, index=["Portland", "Berkeley"] + ... ) >>> df temp_c Portland 17.0 @@ -5161,7 +5232,7 @@ def assign(self, **kwargs) -> DataFrame: Alternatively, the same behavior can be achieved by directly referencing an existing Series or sequence: - >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) + >>> df.assign(temp_f=df["temp_c"] * 9 / 5 + 32) temp_c temp_f Portland 17.0 62.6 Berkeley 25.0 77.0 @@ -5169,8 +5240,10 @@ def assign(self, **kwargs) -> DataFrame: You can create multiple columns within the same assign where one of the columns depends on another one defined within the same assign: - >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, - ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) + >>> df.assign( + ... temp_f=lambda x: x["temp_c"] * 9 / 5 + 32, + ... temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9, + ... ) temp_c temp_f temp_k Portland 17.0 62.6 290.15 Berkeley 25.0 77.0 298.15 @@ -5437,8 +5510,10 @@ def drop( Examples -------- - >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), - ... columns=['A', 'B', 'C', 'D']) + >>> df = pd.DataFrame( + ... np.arange(12).reshape(3, 4), + ... columns=["A", "B", "C", "D"], + ... ) >>> df A B C D 0 0 1 2 3 @@ -5447,13 +5522,13 @@ def drop( Drop columns - >>> df.drop(['B', 'C'], axis=1) + >>> df.drop(["B", "C"], axis=1) A D 0 0 3 1 4 7 2 8 11 - >>> df.drop(columns=['B', 'C']) + >>> df.drop(columns=["B", "C"]) A D 0 0 3 1 4 7 @@ -5467,14 +5542,31 @@ def drop( Drop columns and/or rows of MultiIndex DataFrame - >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], - ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], - ... [250, 150], [1.5, 0.8], [320, 250], - ... [1, 0.8], [0.3, 0.2]]) + >>> midx = pd.MultiIndex( + ... levels=[ + ... ["llama", "cow", "falcon"], + ... ["speed", "weight", "length"], + ... ], + ... codes=[ + ... [0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2], + ... ], + ... ) + >>> df = pd.DataFrame( + ... index=midx, + ... columns=["big", "small"], + ... data=[ + ... [45, 30], + ... [200, 100], + ... [1.5, 1], + ... [30, 20], + ... [250, 150], + ... [1.5, 0.8], + ... [320, 250], + ... [1, 0.8], + ... [0.3, 0.2], + ... ], + ... 
) >>> df big small llama speed 45.0 30.0 @@ -5491,7 +5583,7 @@ def drop( DataFrame, i.e., drop the combination ``'falcon'`` and ``'weight'``, which deletes only the corresponding row - >>> df.drop(index=('falcon', 'weight')) + >>> df.drop(index=("falcon", "weight")) big small llama speed 45.0 30.0 weight 200.0 100.0 @@ -5502,7 +5594,7 @@ def drop( falcon speed 320.0 250.0 length 0.3 0.2 - >>> df.drop(index='cow', columns='small') + >>> df.drop(index="cow", columns="small") big llama speed 45.0 weight 200.0 @@ -5511,7 +5603,7 @@ def drop( weight 1.0 length 0.3 - >>> df.drop(index='length', level=1) + >>> df.drop(index="length", level=1) big small llama speed 45.0 30.0 weight 200.0 100.0 @@ -5688,19 +5780,21 @@ def rename( >>> df.rename(index=str).index Index(['0', '1', '2'], dtype='object') - >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") + >>> df.rename( + ... columns={"A": "a", "B": "b", "C": "c"}, errors="raise" + ... ) Traceback (most recent call last): KeyError: ['C'] not found in axis Using axis-style parameters: - >>> df.rename(str.lower, axis='columns') + >>> df.rename(str.lower, axis="columns") a b 0 1 4 1 2 5 2 3 6 - >>> df.rename({1: 2, 2: 4}, axis='index') + >>> df.rename({1: 2, 2: 4}, axis="index") A B 0 1 4 2 2 5 @@ -5732,11 +5826,15 @@ def pop(self, item: Hashable) -> Series: Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan)], - ... columns=('name', 'class', 'max_speed')) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0), + ... ("parrot", "bird", 24.0), + ... ("lion", "mammal", 80.5), + ... ("monkey", "mammal", np.nan), + ... ], + ... columns=("name", "class", "max_speed"), + ... ) >>> df name class max_speed 0 falcon bird 389.0 @@ -5744,7 +5842,7 @@ def pop(self, item: Hashable) -> Series: 2 lion mammal 80.5 3 monkey mammal NaN - >>> df.pop('class') + >>> df.pop("class") 0 bird 1 bird 2 mammal @@ -5977,9 +6075,13 @@ def set_index( Examples -------- - >>> df = pd.DataFrame({'month': [1, 4, 7, 10], - ... 'year': [2012, 2014, 2013, 2014], - ... 'sale': [55, 40, 84, 31]}) + >>> df = pd.DataFrame( + ... { + ... "month": [1, 4, 7, 10], + ... "year": [2012, 2014, 2013, 2014], + ... "sale": [55, 40, 84, 31], + ... } + ... ) >>> df month year sale 0 1 2012 55 @@ -5989,7 +6091,7 @@ def set_index( Set the index to become the 'month' column: - >>> df.set_index('month') + >>> df.set_index("month") year sale month 1 2012 55 @@ -5999,7 +6101,7 @@ def set_index( Create a MultiIndex using columns 'year' and 'month': - >>> df.set_index(['year', 'month']) + >>> df.set_index(["year", "month"]) sale year month 2012 1 55 @@ -6009,7 +6111,7 @@ def set_index( Create a MultiIndex using an Index and a column: - >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) + >>> df.set_index([pd.Index([1, 2, 3, 4]), "year"]) month sale year 1 2012 1 55 @@ -6235,12 +6337,16 @@ def reset_index( Examples -------- - >>> df = pd.DataFrame([('bird', 389.0), - ... ('bird', 24.0), - ... ('mammal', 80.5), - ... ('mammal', np.nan)], - ... index=['falcon', 'parrot', 'lion', 'monkey'], - ... columns=('class', 'max_speed')) + >>> df = pd.DataFrame( + ... [ + ... ("bird", 389.0), + ... ("bird", 24.0), + ... ("mammal", 80.5), + ... ("mammal", np.nan), + ... ], + ... index=["falcon", "parrot", "lion", "monkey"], + ... columns=("class", "max_speed"), + ... 
) >>> df class max_speed falcon bird 389.0 @@ -6270,19 +6376,28 @@ class max_speed You can also use `reset_index` with `MultiIndex`. - >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), - ... ('bird', 'parrot'), - ... ('mammal', 'lion'), - ... ('mammal', 'monkey')], - ... names=['class', 'name']) - >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), - ... ('species', 'type')]) - >>> df = pd.DataFrame([(389.0, 'fly'), - ... (24.0, 'fly'), - ... (80.5, 'run'), - ... (np.nan, 'jump')], - ... index=index, - ... columns=columns) + >>> index = pd.MultiIndex.from_tuples( + ... [ + ... ("bird", "falcon"), + ... ("bird", "parrot"), + ... ("mammal", "lion"), + ... ("mammal", "monkey"), + ... ], + ... names=["class", "name"], + ... ) + >>> columns = pd.MultiIndex.from_tuples( + ... [("speed", "max"), ("species", "type")] + ... ) + >>> df = pd.DataFrame( + ... [ + ... (389.0, "fly"), + ... (24.0, "fly"), + ... (80.5, "run"), + ... (np.nan, "jump"), + ... ], + ... index=index, + ... columns=columns, + ... ) >>> df speed species max type @@ -6294,7 +6409,7 @@ class name Using the `names` parameter, choose a name for the index column: - >>> df.reset_index(names=['classes', 'names']) + >>> df.reset_index(names=["classes", "names"]) classes names speed species max type 0 bird falcon 389.0 fly @@ -6304,7 +6419,7 @@ class name If the index has multiple levels, we can reset a subset of them: - >>> df.reset_index(level='class') + >>> df.reset_index(level="class") class speed species max type name @@ -6316,7 +6431,7 @@ class speed species If we are not dropping the index, by default, it is placed in the top level. We can place it in another level: - >>> df.reset_index(level='class', col_level=1) + >>> df.reset_index(level="class", col_level=1) speed species class max type name @@ -6328,7 +6443,9 @@ class max type When the index is inserted under another level, we can specify under which one with the parameter `col_fill`: - >>> df.reset_index(level='class', col_level=1, col_fill='species') + >>> df.reset_index( + ... level="class", col_level=1, col_fill="species" + ... ) species speed species class max type name @@ -6339,7 +6456,7 @@ class max type If we specify a nonexistent level for `col_fill`, it is created: - >>> df.reset_index(level='class', col_level=1, col_fill='genus') + >>> df.reset_index(level="class", col_level=1, col_fill="genus") genus speed species class max type name @@ -6535,10 +6652,17 @@ def dropna( Examples -------- - >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], - ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], - ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), - ... pd.NaT]}) + >>> df = pd.DataFrame( + ... { + ... "name": ["Alfred", "Batman", "Catwoman"], + ... "toy": [np.nan, "Batmobile", "Bullwhip"], + ... "born": [ + ... pd.NaT, + ... pd.Timestamp("1940-04-25"), + ... pd.NaT, + ... ], + ... } + ... ) >>> df name toy born 0 Alfred NaN NaT @@ -6553,7 +6677,7 @@ def dropna( Drop the columns where at least one element is missing. - >>> df.dropna(axis='columns') + >>> df.dropna(axis="columns") name 0 Alfred 1 Batman @@ -6561,7 +6685,7 @@ def dropna( Drop the rows where all elements are missing. - >>> df.dropna(how='all') + >>> df.dropna(how="all") name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 @@ -6576,7 +6700,7 @@ def dropna( Define in which columns to look for missing values. 
- >>> df.dropna(subset=['name', 'toy']) + >>> df.dropna(subset=["name", "toy"]) name toy born 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT @@ -6711,11 +6835,19 @@ def drop_duplicates( -------- Consider dataset containing ramen rating. - >>> df = pd.DataFrame({ - ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], - ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], - ... 'rating': [4, 4, 3.5, 15, 5] - ... }) + >>> df = pd.DataFrame( + ... { + ... "brand": [ + ... "Yum Yum", + ... "Yum Yum", + ... "Indomie", + ... "Indomie", + ... "Indomie", + ... ], + ... "style": ["cup", "cup", "cup", "pack", "pack"], + ... "rating": [4, 4, 3.5, 15, 5], + ... } + ... ) >>> df brand style rating 0 Yum Yum cup 4.0 @@ -6735,14 +6867,14 @@ def drop_duplicates( To remove duplicates on specific column(s), use ``subset``. - >>> df.drop_duplicates(subset=['brand']) + >>> df.drop_duplicates(subset=["brand"]) brand style rating 0 Yum Yum cup 4.0 2 Indomie cup 3.5 To remove duplicates and keep last occurrences, use ``keep``. - >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') + >>> df.drop_duplicates(subset=["brand", "style"], keep="last") brand style rating 1 Yum Yum cup 4.0 2 Indomie cup 3.5 @@ -6802,11 +6934,19 @@ def duplicated( -------- Consider dataset containing ramen rating. - >>> df = pd.DataFrame({ - ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], - ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], - ... 'rating': [4, 4, 3.5, 15, 5] - ... }) + >>> df = pd.DataFrame( + ... { + ... "brand": [ + ... "Yum Yum", + ... "Yum Yum", + ... "Indomie", + ... "Indomie", + ... "Indomie", + ... ], + ... "style": ["cup", "cup", "cup", "pack", "pack"], + ... "rating": [4, 4, 3.5, 15, 5], + ... } + ... ) >>> df brand style rating 0 Yum Yum cup 4.0 @@ -6829,7 +6969,7 @@ def duplicated( By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True. - >>> df.duplicated(keep='last') + >>> df.duplicated(keep="last") 0 True 1 False 2 False @@ -6849,7 +6989,7 @@ def duplicated( To find duplicates on specific column(s), use ``subset``. - >>> df.duplicated(subset=['brand']) + >>> df.duplicated(subset=["brand"]) 0 False 1 True 2 False @@ -7002,12 +7142,14 @@ def sort_values( Examples -------- - >>> df = pd.DataFrame({ - ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], - ... 'col2': [2, 1, 9, 8, 7, 4], - ... 'col3': [0, 1, 9, 4, 2, 3], - ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] - ... }) + >>> df = pd.DataFrame( + ... { + ... "col1": ["A", "A", "B", np.nan, "D", "C"], + ... "col2": [2, 1, 9, 8, 7, 4], + ... "col3": [0, 1, 9, 4, 2, 3], + ... "col4": ["a", "B", "c", "D", "e", "F"], + ... } + ... ) >>> df col1 col2 col3 col4 0 A 2 0 a @@ -7019,7 +7161,7 @@ def sort_values( Sort by col1 - >>> df.sort_values(by=['col1']) + >>> df.sort_values(by=["col1"]) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B @@ -7030,7 +7172,7 @@ def sort_values( Sort by multiple columns - >>> df.sort_values(by=['col1', 'col2']) + >>> df.sort_values(by=["col1", "col2"]) col1 col2 col3 col4 1 A 1 1 B 0 A 2 0 a @@ -7041,7 +7183,7 @@ def sort_values( Sort Descending - >>> df.sort_values(by='col1', ascending=False) + >>> df.sort_values(by="col1", ascending=False) col1 col2 col3 col4 4 D 7 2 e 5 C 4 3 F @@ -7052,7 +7194,9 @@ def sort_values( Putting NAs first - >>> df.sort_values(by='col1', ascending=False, na_position='first') + >>> df.sort_values( + ... by="col1", ascending=False, na_position="first" + ... 
) col1 col2 col3 col4 3 NaN 8 4 D 4 D 7 2 e @@ -7063,7 +7207,7 @@ def sort_values( Sorting with a key function - >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) + >>> df.sort_values(by="col4", key=lambda col: col.str.lower()) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B @@ -7075,10 +7219,12 @@ def sort_values( Natural sort with the key argument, using the `natsort ` package. - >>> df = pd.DataFrame({ - ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], - ... "value": [10, 20, 30, 40, 50] - ... }) + >>> df = pd.DataFrame( + ... { + ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], + ... "value": [10, 20, 30, 40, 50], + ... } + ... ) >>> df time value 0 0hr 10 @@ -7089,7 +7235,7 @@ def sort_values( >>> from natsort import index_natsorted >>> df.sort_values( ... by="time", - ... key=lambda x: np.argsort(index_natsorted(df["time"])) + ... key=lambda x: np.argsort(index_natsorted(df["time"])), ... ) time value 0 0hr 10 @@ -7290,8 +7436,11 @@ def sort_index( Examples -------- - >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], - ... columns=['A']) + >>> df = pd.DataFrame( + ... [1, 2, 3, 4, 5], + ... index=[100, 29, 234, 1, 150], + ... columns=["A"], + ... ) >>> df.sort_index() A 1 4 @@ -7314,7 +7463,9 @@ def sort_index( A key function can be specified which is applied to the index before sorting. For a ``MultiIndex`` this is applied to each level separately. - >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) + >>> df = pd.DataFrame( + ... {"a": [1, 2, 3, 4]}, index=["A", "b", "C", "d"] + ... ) >>> df.sort_index(key=lambda x: x.str.lower()) a A 1 @@ -7378,9 +7529,10 @@ def value_counts( Examples -------- - >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], - ... 'num_wings': [2, 0, 0, 0]}, - ... index=['falcon', 'dog', 'cat', 'ant']) + >>> df = pd.DataFrame( + ... {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + ... index=["falcon", "dog", "cat", "ant"], + ... ) >>> df num_legs num_wings falcon 2 2 @@ -7418,8 +7570,12 @@ def value_counts( With `dropna` set to `False` we can also count rows with NA values. - >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], - ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) + >>> df = pd.DataFrame( + ... { + ... "first_name": ["John", "Anne", "John", "Beth"], + ... "middle_name": ["Smith", pd.NA, pd.NA, "Louise"], + ... } + ... ) >>> df first_name middle_name 0 John Smith @@ -7517,16 +7673,54 @@ def nlargest( Examples -------- - >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, - ... 434000, 434000, 337000, 11300, - ... 11300, 11300], - ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, - ... 17036, 182, 38, 311], - ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", - ... "IS", "NR", "TV", "AI"]}, - ... index=["Italy", "France", "Malta", - ... "Maldives", "Brunei", "Iceland", - ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df = pd.DataFrame( + ... { + ... "population": [ + ... 59000000, + ... 65000000, + ... 434000, + ... 434000, + ... 434000, + ... 337000, + ... 11300, + ... 11300, + ... 11300, + ... ], + ... "GDP": [ + ... 1937894, + ... 2583560, + ... 12011, + ... 4520, + ... 12128, + ... 17036, + ... 182, + ... 38, + ... 311, + ... ], + ... "alpha-2": [ + ... "IT", + ... "FR", + ... "MT", + ... "MV", + ... "BN", + ... "IS", + ... "NR", + ... "TV", + ... "AI", + ... ], + ... }, + ... index=[ + ... "Italy", + ... "France", + ... "Malta", + ... "Maldives", + ... "Brunei", + ... "Iceland", + ... "Nauru", + ... "Tuvalu", + ... "Anguilla", + ... ], + ... 
) >>> df population GDP alpha-2 Italy 59000000 1937894 IT @@ -7542,7 +7736,7 @@ def nlargest( In the following example, we will use ``nlargest`` to select the three rows having the largest values in column "population". - >>> df.nlargest(3, 'population') + >>> df.nlargest(3, "population") population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7550,7 +7744,7 @@ def nlargest( When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nlargest(3, 'population', keep='last') + >>> df.nlargest(3, "population", keep="last") population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7560,7 +7754,7 @@ def nlargest( if there are duplicate values for the smallest element, all the ties are kept: - >>> df.nlargest(3, 'population', keep='all') + >>> df.nlargest(3, "population", keep="all") population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7570,7 +7764,7 @@ def nlargest( However, ``nlargest`` does not keep ``n`` distinct largest elements: - >>> df.nlargest(5, 'population', keep='all') + >>> df.nlargest(5, "population", keep="all") population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7581,7 +7775,7 @@ def nlargest( To order by the largest values in column "population" and then "GDP", we can specify multiple columns like in the next example. - >>> df.nlargest(3, ['population', 'GDP']) + >>> df.nlargest(3, ["population", "GDP"]) population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7630,16 +7824,54 @@ def nsmallest( Examples -------- - >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, - ... 434000, 434000, 337000, 337000, - ... 11300, 11300], - ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, - ... 17036, 182, 38, 311], - ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", - ... "IS", "NR", "TV", "AI"]}, - ... index=["Italy", "France", "Malta", - ... "Maldives", "Brunei", "Iceland", - ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df = pd.DataFrame( + ... { + ... "population": [ + ... 59000000, + ... 65000000, + ... 434000, + ... 434000, + ... 434000, + ... 337000, + ... 337000, + ... 11300, + ... 11300, + ... ], + ... "GDP": [ + ... 1937894, + ... 2583560, + ... 12011, + ... 4520, + ... 12128, + ... 17036, + ... 182, + ... 38, + ... 311, + ... ], + ... "alpha-2": [ + ... "IT", + ... "FR", + ... "MT", + ... "MV", + ... "BN", + ... "IS", + ... "NR", + ... "TV", + ... "AI", + ... ], + ... }, + ... index=[ + ... "Italy", + ... "France", + ... "Malta", + ... "Maldives", + ... "Brunei", + ... "Iceland", + ... "Nauru", + ... "Tuvalu", + ... "Anguilla", + ... ], + ... ) >>> df population GDP alpha-2 Italy 59000000 1937894 IT @@ -7655,7 +7887,7 @@ def nsmallest( In the following example, we will use ``nsmallest`` to select the three rows having the smallest values in column "population". - >>> df.nsmallest(3, 'population') + >>> df.nsmallest(3, "population") population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7663,7 +7895,7 @@ def nsmallest( When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nsmallest(3, 'population', keep='last') + >>> df.nsmallest(3, "population", keep="last") population GDP alpha-2 Anguilla 11300 311 AI Tuvalu 11300 38 TV @@ -7673,7 +7905,7 @@ def nsmallest( if there are duplicate values for the largest element, all the ties are kept. 
- >>> df.nsmallest(3, 'population', keep='all') + >>> df.nsmallest(3, "population", keep="all") population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7683,7 +7915,7 @@ def nsmallest( However, ``nsmallest`` does not keep ``n`` distinct smallest elements: - >>> df.nsmallest(4, 'population', keep='all') + >>> df.nsmallest(4, "population", keep="all") population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7693,7 +7925,7 @@ def nsmallest( To order by the smallest values in column "population" and then "GDP", we can specify multiple columns like in the next example. - >>> df.nsmallest(3, ['population', 'GDP']) + >>> df.nsmallest(3, ["population", "GDP"]) population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7802,7 +8034,9 @@ def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFram ... "diet": ["Omnivore", "Carnivore", "Carnivore"], ... "species": ["Humans", "Dogs", "Snakes"], ... } - >>> df = pd.DataFrame(data, columns=["class", "diet", "species"]) + >>> df = pd.DataFrame( + ... data, columns=["class", "diet", "species"] + ... ) >>> df = df.set_index(["class", "diet"]) >>> df species @@ -8586,9 +8820,11 @@ def combine( -------- Combine using a simple function that chooses the smaller column. - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) - >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 + >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) + >>> take_smaller = ( + ... lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 + ... ) >>> df1.combine(df2, take_smaller) A B 0 0 3 @@ -8596,8 +8832,8 @@ def combine( Example using a true element-wise combine function. - >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1 = pd.DataFrame({"A": [5, 0], "B": [2, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine(df2, np.minimum) A B 0 1 2 @@ -8606,8 +8842,8 @@ def combine( Using `fill_value` fills Nones prior to passing the column to the merge function. - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine(df2, take_smaller, fill_value=-5) A B 0 0 -5.0 @@ -8616,8 +8852,8 @@ def combine( However, if the same element in both dataframes is None, that None is preserved - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) + >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [None, 3]}) >>> df1.combine(df2, take_smaller, fill_value=-5) A B 0 0 -5.0 @@ -8626,8 +8862,14 @@ def combine( Example that demonstrates the use of `overwrite` and behavior when the axis differ between the dataframes. - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) + >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]}) + >>> df2 = pd.DataFrame( + ... { + ... "B": [3, 3], + ... "C": [-10, 1], + ... }, + ... index=[1, 2], + ... ) >>> df1.combine(df2, take_smaller) A B C 0 NaN NaN NaN @@ -8642,7 +8884,13 @@ def combine( Demonstrating the preference of the passed in dataframe. - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) + >>> df2 = pd.DataFrame( + ... { + ... "B": [3, 3], + ... "C": [1, 1], + ... 
}, + ... index=[1, 2], + ... ) >>> df2.combine(df1, take_smaller) A B C 0 0.0 NaN NaN @@ -8752,8 +9000,8 @@ def combine_first(self, other: DataFrame) -> DataFrame: Examples -------- - >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine_first(df2) A B 0 1.0 3.0 @@ -8762,8 +9010,8 @@ def combine_first(self, other: DataFrame) -> DataFrame: Null values still persist if the location of that null value does not exist in `other` - >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) + >>> df1 = pd.DataFrame({"A": [None, 0], "B": [4, None]}) + >>> df2 = pd.DataFrame({"B": [3, 3], "C": [1, 1]}, index=[1, 2]) >>> df1.combine_first(df2) A B C 0 NaN 4.0 NaN @@ -8862,10 +9110,8 @@ def update( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [400, 500, 600]}) - >>> new_df = pd.DataFrame({'B': [4, 5, 6], - ... 'C': [7, 8, 9]}) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]}) + >>> new_df = pd.DataFrame({"B": [4, 5, 6], "C": [7, 8, 9]}) >>> df.update(new_df) >>> df A B @@ -8876,9 +9122,10 @@ def update( The DataFrame's length does not increase as a result of the update, only values at matching index/column labels are updated. - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) + >>> df = pd.DataFrame( + ... {"A": ["a", "b", "c"], "B": ["x", "y", "z"]} + ... ) + >>> new_df = pd.DataFrame({"B": ["d", "e", "f", "g", "h", "i"]}) >>> df.update(new_df) >>> df A B @@ -8886,9 +9133,10 @@ def update( 1 b e 2 c f - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2]) + >>> df = pd.DataFrame( + ... {"A": ["a", "b", "c"], "B": ["x", "y", "z"]} + ... ) + >>> new_df = pd.DataFrame({"B": ["d", "f"]}, index=[0, 2]) >>> df.update(new_df) >>> df A B @@ -8898,9 +9146,10 @@ def update( For Series, its name attribute must be set. - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_column = pd.Series(['d', 'e', 'f'], name='B') + >>> df = pd.DataFrame( + ... {"A": ["a", "b", "c"], "B": ["x", "y", "z"]} + ... ) + >>> new_column = pd.Series(["d", "e", "f"], name="B") >>> df.update(new_column) >>> df A B @@ -8911,9 +9160,10 @@ def update( If `other` contains NaNs the corresponding values are not updated in the original dataframe. - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [400., 500., 600.]}) - >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3], "B": [400.0, 500.0, 600.0]} + ... ) + >>> new_df = pd.DataFrame({"B": [4, np.nan, 6]}) >>> df.update(new_df) >>> df A B @@ -9127,9 +9377,7 @@ def groupby( dropna=dropna, ) - _shared_docs[ - "pivot" - ] = """ + _shared_docs["pivot"] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses @@ -9273,9 +9521,7 @@ def pivot( return pivot(self, index=index, columns=columns, values=values) - _shared_docs[ - "pivot_table" - ] = """ + _shared_docs["pivot_table"] = """ Create a spreadsheet-style pivot table as a DataFrame. 
The levels in the pivot table will be stored in MultiIndex objects @@ -9523,9 +9769,11 @@ def stack( -------- **Single level columns** - >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], - ... index=['cat', 'dog'], - ... columns=['weight', 'height']) + >>> df_single_level_cols = pd.DataFrame( + ... [[0, 1], [2, 3]], + ... index=["cat", "dog"], + ... columns=["weight", "height"], + ... ) Stacking a dataframe with a single level column axis returns a Series: @@ -9542,11 +9790,14 @@ def stack( **Multi level columns: simple case** - >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), - ... ('weight', 'pounds')]) - >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], - ... index=['cat', 'dog'], - ... columns=multicol1) + >>> multicol1 = pd.MultiIndex.from_tuples( + ... [("weight", "kg"), ("weight", "pounds")] + ... ) + >>> df_multi_level_cols1 = pd.DataFrame( + ... [[1, 2], [2, 4]], + ... index=["cat", "dog"], + ... columns=multicol1, + ... ) Stacking a dataframe with a multi-level column axis: @@ -9564,11 +9815,14 @@ def stack( **Missing values** - >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), - ... ('height', 'm')]) - >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], - ... index=['cat', 'dog'], - ... columns=multicol2) + >>> multicol2 = pd.MultiIndex.from_tuples( + ... [("weight", "kg"), ("height", "m")] + ... ) + >>> df_multi_level_cols2 = pd.DataFrame( + ... [[1.0, 2.0], [3.0, 4.0]], + ... index=["cat", "dog"], + ... columns=multicol2, + ... ) It is common to have missing values when stacking a dataframe with multi-level columns, as the stacked dataframe typically @@ -9722,9 +9976,13 @@ def explode( Examples -------- - >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], - ... 'B': 1, - ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) + >>> df = pd.DataFrame( + ... { + ... "A": [[0, 1, 2], "foo", [], [3, 4]], + ... "B": 1, + ... "C": [["a", "b", "c"], np.nan, [], ["d", "e"]], + ... } + ... ) >>> df A B C 0 [0, 1, 2] 1 [a, b, c] @@ -9734,7 +9992,7 @@ def explode( Single-column explode. - >>> df.explode('A') + >>> df.explode("A") A B C 0 0 1 [a, b, c] 0 1 1 [a, b, c] @@ -9746,7 +10004,7 @@ def explode( Multi-column explode. - >>> df.explode(list('AC')) + >>> df.explode(list("AC")) A B C 0 0 1 a 0 1 1 b @@ -9830,8 +10088,9 @@ def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): Examples -------- - >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), - ... ('two', 'a'), ('two', 'b')]) + >>> index = pd.MultiIndex.from_tuples( + ... [("one", "a"), ("one", "b"), ("two", "a"), ("two", "b")] + ... ) >>> s = pd.Series(np.arange(1.0, 5.0), index=index) >>> s one a 1.0 @@ -10225,7 +10484,7 @@ def apply( Examples -------- - >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) + >>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"]) >>> df A B 0 4 9 @@ -10265,7 +10524,7 @@ def apply( Passing ``result_type='expand'`` will expand list-like results to columns of a Dataframe - >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') + >>> df.apply(lambda x: [1, 2], axis=1, result_type="expand") 0 1 0 1 2 1 1 2 @@ -10275,7 +10534,10 @@ def apply( ``result_type='expand'``. The resulting column names will be the Series index. - >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) + >>> df.apply( + ... lambda x: pd.Series([1, 2], index=["foo", "bar"]), + ... axis=1, + ... ) foo bar 0 1 2 1 1 2 @@ -10286,7 +10548,7 @@ def apply( and broadcast it along the axis. 
The resulting column names will be the originals. - >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') + >>> df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") A B 0 1 2 1 1 2 @@ -10359,7 +10621,7 @@ def map( >>> df_copy = df.copy() >>> df_copy.iloc[0, 0] = pd.NA - >>> df_copy.map(lambda x: len(str(x)), na_action='ignore') + >>> df_copy.map(lambda x: len(str(x)), na_action="ignore") 0 1 0 NaN 4 1 5.0 5 @@ -10382,7 +10644,7 @@ def map( But it's better to avoid map in that case. - >>> df ** 2 + >>> df**2 0 1 0 1.000000 4.494400 1 11.262736 20.857489 @@ -10587,8 +10849,12 @@ def join( Examples -------- - >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], - ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + >>> df = pd.DataFrame( + ... { + ... "key": ["K0", "K1", "K2", "K3", "K4", "K5"], + ... "A": ["A0", "A1", "A2", "A3", "A4", "A5"], + ... } + ... ) >>> df key A @@ -10599,8 +10865,9 @@ def join( 4 K4 A4 5 K5 A5 - >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], - ... 'B': ['B0', 'B1', 'B2']}) + >>> other = pd.DataFrame( + ... {"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]} + ... ) >>> other key B @@ -10610,7 +10877,7 @@ def join( Join DataFrames using their indexes. - >>> df.join(other, lsuffix='_caller', rsuffix='_other') + >>> df.join(other, lsuffix="_caller", rsuffix="_other") key_caller A key_other B 0 K0 A0 K0 B0 1 K1 A1 K1 B1 @@ -10623,7 +10890,7 @@ def join( the index in both `df` and `other`. The joined DataFrame will have key as its index. - >>> df.set_index('key').join(other.set_index('key')) + >>> df.set_index("key").join(other.set_index("key")) A B key K0 A0 B0 @@ -10638,7 +10905,7 @@ def join( any column in `df`. This method preserves the original DataFrame's index in the result. - >>> df.join(other.set_index('key'), on='key') + >>> df.join(other.set_index("key"), on="key") key A B 0 K0 A0 B0 1 K1 A1 B1 @@ -10649,8 +10916,12 @@ def join( Using non-unique key values shows how they are matched. - >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], - ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + >>> df = pd.DataFrame( + ... { + ... "key": ["K0", "K1", "K1", "K3", "K0", "K1"], + ... "A": ["A0", "A1", "A2", "A3", "A4", "A5"], + ... } + ... ) >>> df key A @@ -10661,7 +10932,7 @@ def join( 4 K0 A4 5 K1 A5 - >>> df.join(other.set_index('key'), on='key', validate='m:1') + >>> df.join(other.set_index("key"), on="key", validate="m:1") key A B 0 K0 A0 B0 1 K1 A1 B1 @@ -10817,8 +11088,15 @@ def round( Examples -------- - >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], - ... columns=['dogs', 'cats']) + >>> df = pd.DataFrame( + ... [ + ... (0.21, 0.32), + ... (0.01, 0.67), + ... (0.66, 0.03), + ... (0.21, 0.18), + ... ], + ... columns=["dogs", "cats"], + ... ) >>> df dogs cats 0 0.21 0.32 @@ -10840,7 +11118,7 @@ def round( specified with the column names as key and the number of decimal places as value - >>> df.round({'dogs': 1, 'cats': 0}) + >>> df.round({"dogs": 1, "cats": 0}) dogs cats 0 0.2 0.0 1 0.0 1.0 @@ -10851,7 +11129,7 @@ def round( specified with the column names as index and the number of decimal places as value - >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) + >>> decimals = pd.Series([0, 1], index=["cats", "dogs"]) >>> df.round(decimals) dogs cats 0 0.2 0.0 @@ -10964,15 +11242,19 @@ def corr( >>> def histogram_intersection(a, b): ... v = np.minimum(a, b).sum().round(decimals=1) ... return v - >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], - ... 
columns=['dogs', 'cats']) + >>> df = pd.DataFrame( + ... [(0.2, 0.3), (0.0, 0.6), (0.6, 0.0), (0.2, 0.1)], + ... columns=["dogs", "cats"], + ... ) >>> df.corr(method=histogram_intersection) dogs cats dogs 1.0 0.3 cats 0.3 1.0 - >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], - ... columns=['dogs', 'cats']) + >>> df = pd.DataFrame( + ... [(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], + ... columns=["dogs", "cats"], + ... ) >>> df.corr(min_periods=3) dogs cats dogs 1.0 NaN @@ -11098,16 +11380,20 @@ def cov( Examples -------- - >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], - ... columns=['dogs', 'cats']) + >>> df = pd.DataFrame( + ... [(1, 2), (0, 3), (2, 0), (1, 1)], + ... columns=["dogs", "cats"], + ... ) >>> df.cov() dogs cats dogs 0.666667 -1.000000 cats -1.000000 1.666667 >>> np.random.seed(42) - >>> df = pd.DataFrame(np.random.randn(1000, 5), - ... columns=['a', 'b', 'c', 'd', 'e']) + >>> df = pd.DataFrame( + ... np.random.randn(1000, 5), + ... columns=["a", "b", "c", "d", "e"], + ... ) >>> df.cov() a b c d e a 0.998438 -0.020161 0.059277 -0.008943 0.014144 @@ -11123,10 +11409,11 @@ def cov( each column pair in order to have a valid result: >>> np.random.seed(42) - >>> df = pd.DataFrame(np.random.randn(20, 3), - ... columns=['a', 'b', 'c']) - >>> df.loc[df.index[:5], 'a'] = np.nan - >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df = pd.DataFrame( + ... np.random.randn(20, 3), columns=["a", "b", "c"] + ... ) + >>> df.loc[df.index[:5], "a"] = np.nan + >>> df.loc[df.index[5:10], "b"] = np.nan >>> df.cov(min_periods=12) a b c a 0.316741 NaN -0.150812 @@ -11206,8 +11493,16 @@ def corrwith( -------- >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] - >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) - >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) + >>> df1 = pd.DataFrame( + ... np.arange(20).reshape(5, 4), + ... index=index, + ... columns=columns, + ... ) + >>> df2 = pd.DataFrame( + ... np.arange(16).reshape(4, 4), + ... index=index[:4], + ... columns=columns, + ... ) >>> df1.corrwith(df2) one 1.0 two 1.0 @@ -11222,7 +11517,7 @@ def corrwith( d 1.0 e NaN dtype: float64 - """ # noqa: E501 + """ axis = self._get_axis_number(axis) this = self._get_numeric_data() if numeric_only else self @@ -11322,10 +11617,13 @@ def count(self, axis: Axis = 0, numeric_only: bool = False): -------- Constructing DataFrame from a dictionary: - >>> df = pd.DataFrame({"Person": - ... ["John", "Myla", "Lewis", "John", "Myla"], - ... "Age": [24., np.nan, 21., 33, 26], - ... "Single": [False, True, True, True, False]}) + >>> df = pd.DataFrame( + ... { + ... "Person": ["John", "Myla", "Lewis", "John", "Myla"], + ... "Age": [24.0, np.nan, 21.0, 33, 26], + ... "Single": [False, True, True, True, False], + ... } + ... ) >>> df Person Age Single 0 John 24.0 False @@ -11344,7 +11642,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False): Counts for each **row**: - >>> df.count(axis='columns') + >>> df.count(axis="columns") 0 3 1 2 2 3 @@ -11756,7 +12054,7 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: Examples -------- - >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df = pd.DataFrame({"A": [4, 5, 6], "B": [4, 1, 1]}) >>> df.nunique() A 3 B 2 @@ -11889,12 +12187,16 @@ def mode( Examples -------- - >>> df = pd.DataFrame([('bird', 2, 2), - ... ('mammal', 4, np.nan), - ... ('arthropod', 8, 0), - ... ('bird', 2, np.nan)], - ... 
index=('falcon', 'horse', 'spider', 'ostrich'), - ... columns=('species', 'legs', 'wings')) + >>> df = pd.DataFrame( + ... [ + ... ("bird", 2, 2), + ... ("mammal", 4, np.nan), + ... ("arthropod", 8, 0), + ... ("bird", 2, np.nan), + ... ], + ... index=("falcon", "horse", "spider", "ostrich"), + ... columns=("species", "legs", "wings"), + ... ) >>> df species legs wings falcon bird 2 2.0 @@ -11928,7 +12230,7 @@ def mode( To compute the mode over columns and not rows, use the axis parameter: - >>> df.mode(axis='columns', numeric_only=True) + >>> df.mode(axis="columns", numeric_only=True) 0 1 falcon 2.0 NaN horse 4.0 NaN @@ -12035,24 +12337,28 @@ def quantile( Examples -------- - >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), - ... columns=['a', 'b']) - >>> df.quantile(.1) + >>> df = pd.DataFrame( + ... np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=["a", "b"], + ... ) + >>> df.quantile(0.1) a 1.3 b 3.7 Name: 0.1, dtype: float64 - >>> df.quantile([.1, .5]) + >>> df.quantile([0.1, 0.5]) a b 0.1 1.3 3.7 0.5 2.5 55.0 Specifying `method='table'` will compute the quantile over all columns. - >>> df.quantile(.1, method="table", interpolation="nearest") + >>> df.quantile(0.1, method="table", interpolation="nearest") a 1 b 1 Name: 0.1, dtype: int64 - >>> df.quantile([.1, .5], method="table", interpolation="nearest") + >>> df.quantile( + ... [0.1, 0.5], method="table", interpolation="nearest" + ... ) a b 0.1 1 1 0.5 3 100 @@ -12060,11 +12366,16 @@ def quantile( Specifying `numeric_only=False` will also compute the quantile of datetime and timedelta data. - >>> df = pd.DataFrame({'A': [1, 2], - ... 'B': [pd.Timestamp('2010'), - ... pd.Timestamp('2011')], - ... 'C': [pd.Timedelta('1 days'), - ... pd.Timedelta('2 days')]}) + >>> df = pd.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [pd.Timestamp("2010"), pd.Timestamp("2011")], + ... "C": [ + ... pd.Timedelta("1 days"), + ... pd.Timedelta("2 days"), + ... ], + ... } + ... ) >>> df.quantile(0.5, numeric_only=False) A 1.5 B 2010-07-02 12:00:00 @@ -12196,8 +12507,8 @@ def to_timestamp( Examples -------- - >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y') - >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> idx = pd.PeriodIndex(["2023", "2024"], freq="Y") + >>> d = {"col1": [1, 2], "col2": [3, 4]} >>> df1 = pd.DataFrame(data=d, index=idx) >>> df1 col1 col2 @@ -12217,7 +12528,7 @@ def to_timestamp( Using `freq` which is the offset that the Timestamps will have >>> df2 = pd.DataFrame(data=d, index=idx) - >>> df2 = df2.to_timestamp(freq='M') + >>> df2 = df2.to_timestamp(freq="M") >>> df2 col1 col2 2023-01-31 1 3 @@ -12334,8 +12645,10 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, - ... index=['falcon', 'dog']) + >>> df = pd.DataFrame( + ... {"num_legs": [2, 4], "num_wings": [2, 0]}, + ... index=["falcon", "dog"], + ... ) >>> df num_legs num_wings falcon 2 2 @@ -12359,7 +12672,7 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: When ``values`` is a dict, we can pass values to check for each column separately: - >>> df.isin({'num_wings': [0, 3]}) + >>> df.isin({"num_wings": [0, 3]}) num_legs num_wings falcon False False dog False True @@ -12368,8 +12681,10 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: match. Note that 'falcon' does not match based on the number of legs in other. 
- >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, - ... index=['spider', 'falcon']) + >>> other = pd.DataFrame( + ... {"num_legs": [8, 3], "num_wings": [0, 2]}, + ... index=["spider", "falcon"], + ... ) >>> df.isin(other) num_legs num_wings falcon False True @@ -12519,7 +12834,7 @@ def _to_dict_of_blocks(self): mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v, in mgr.to_dict().items() + for k, v in mgr.to_dict().items() } @property @@ -12562,9 +12877,13 @@ def values(self) -> np.ndarray: A DataFrame where all columns are the same type (e.g., int64) results in an array of the same type. - >>> df = pd.DataFrame({'age': [ 3, 29], - ... 'height': [94, 170], - ... 'weight': [31, 115]}) + >>> df = pd.DataFrame( + ... { + ... "age": [3, 29], + ... "height": [94, 170], + ... "weight": [31, 115], + ... } + ... ) >>> df age height weight 0 3 94 31 @@ -12582,10 +12901,14 @@ def values(self) -> np.ndarray: results in an ndarray of the broadest type that accommodates these mixed types (e.g., object). - >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), - ... ('lion', 80.5, 1), - ... ('monkey', np.nan, None)], - ... columns=('name', 'max_speed', 'rank')) + >>> df2 = pd.DataFrame( + ... [ + ... ("parrot", 24.0, "second"), + ... ("lion", 80.5, 1), + ... ("monkey", np.nan, None), + ... ], + ... columns=("name", "max_speed", "rank"), + ... ) >>> df2.dtypes name object max_speed float64 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 22ac3f44d0d56..71128d6888bb2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -390,7 +390,7 @@ def attrs(self) -> dict[Hashable, Any]: For DataFrame: - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> df.attrs = {"A": [10, 20, 30]} >>> df.attrs {'A': [10, 20, 30]} @@ -706,11 +706,11 @@ def ndim(self) -> int: Examples -------- - >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s = pd.Series({"a": 1, "b": 2, "c": 3}) >>> s.ndim 1 - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.ndim 2 """ @@ -731,11 +731,11 @@ def size(self) -> int: Examples -------- - >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s = pd.Series({"a": 1, "b": 2, "c": 3}) >>> s.size 3 - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.size 4 """ @@ -905,15 +905,17 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: Examples -------- - >>> df = pd.DataFrame([ - ... [1, 2, 3, 4], - ... [5, 6, 7, 8], - ... [9, 10, 11, 12] - ... ]).set_index([0, 1]).rename_axis(['a', 'b']) + >>> df = ( + ... pd.DataFrame( + ... [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] + ... ) + ... .set_index([0, 1]) + ... .rename_axis(["a", "b"]) + ... ) - >>> df.columns = pd.MultiIndex.from_tuples([ - ... ('c', 'e'), ('d', 'f') - ... ], names=['level_1', 'level_2']) + >>> df.columns = pd.MultiIndex.from_tuples( + ... [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ... 
) >>> df level_1 c d @@ -923,7 +925,7 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: 5 6 7 8 9 10 11 12 - >>> df.droplevel('a') + >>> df.droplevel("a") level_1 c d level_2 e f b @@ -931,7 +933,7 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: 6 7 8 10 11 12 - >>> df.droplevel('level_2', axis=1) + >>> df.droplevel("level_2", axis=1) level_1 c d a b 1 2 3 4 @@ -1011,7 +1013,7 @@ def squeeze(self, axis: Axis | None = None): Squeezing is even more effective when used with DataFrames. - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) >>> df a b 0 1 2 @@ -1020,7 +1022,7 @@ def squeeze(self, axis: Axis | None = None): Slicing a single column will produce a DataFrame with the columns having only one value: - >>> df_a = df[['a']] + >>> df_a = df[["a"]] >>> df_a a 0 1 @@ -1028,7 +1030,7 @@ def squeeze(self, axis: Axis | None = None): So the columns can be squeezed down, resulting in a Series: - >>> df_a.squeeze('columns') + >>> df_a.squeeze("columns") 0 1 1 3 Name: a, dtype: int64 @@ -1036,14 +1038,14 @@ def squeeze(self, axis: Axis | None = None): Slicing a single row from a single column will produce a single scalar DataFrame: - >>> df_0a = df.loc[df.index < 1, ['a']] + >>> df_0a = df.loc[df.index < 1, ["a"]] >>> df_0a a 0 1 Squeezing the rows produces a single scalar Series: - >>> df_0a.squeeze('rows') + >>> df_0a.squeeze("rows") a 1 Name: 0, dtype: int64 @@ -1273,9 +1275,10 @@ def rename_axis( **DataFrame** - >>> df = pd.DataFrame({"num_legs": [4, 4, 2], - ... "num_arms": [0, 0, 2]}, - ... ["dog", "cat", "monkey"]) + >>> df = pd.DataFrame( + ... {"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]}, + ... ["dog", "cat", "monkey"], + ... ) >>> df num_legs num_arms dog 4 0 @@ -1298,9 +1301,10 @@ def rename_axis( **MultiIndex** - >>> df.index = pd.MultiIndex.from_product([['mammal'], - ... ['dog', 'cat', 'monkey']], - ... names=['type', 'name']) + >>> df.index = pd.MultiIndex.from_product( + ... [["mammal"], ["dog", "cat", "monkey"]], + ... names=["type", "name"], + ... ) >>> df limbs num_legs num_arms type name @@ -1308,7 +1312,7 @@ def rename_axis( cat 4 0 monkey 2 2 - >>> df.rename_axis(index={'type': 'class'}) + >>> df.rename_axis(index={"type": "class"}) limbs num_legs num_arms class name mammal dog 4 0 @@ -1397,8 +1401,9 @@ def _set_axis_name( Examples -------- - >>> df = pd.DataFrame({"num_legs": [4, 4, 2]}, - ... ["dog", "cat", "monkey"]) + >>> df = pd.DataFrame( + ... {"num_legs": [4, 4, 2]}, ["dog", "cat", "monkey"] + ... ) >>> df num_legs dog 4 @@ -1411,7 +1416,8 @@ def _set_axis_name( cat 4 monkey 2 >>> df.index = pd.MultiIndex.from_product( - ... [["mammal"], ['dog', 'cat', 'monkey']]) + ... [["mammal"], ["dog", "cat", "monkey"]] + ... ) >>> df._set_axis_name(["type", "name"]) num_legs type name @@ -1614,9 +1620,9 @@ def bool(self) -> bool_t: >>> pd.Series([False]).bool() # doctest: +SKIP False - >>> pd.DataFrame({'col': [True]}).bool() # doctest: +SKIP + >>> pd.DataFrame({"col": [True]}).bool() # doctest: +SKIP True - >>> pd.DataFrame({'col': [False]}).bool() # doctest: +SKIP + >>> pd.DataFrame({"col": [False]}).bool() # doctest: +SKIP False This is an alternative method and will only work @@ -1689,7 +1695,7 @@ def abs(self) -> Self: Absolute numeric values in a Series with a Timedelta element. 
- >>> s = pd.Series([pd.Timedelta('1 days')]) + >>> s = pd.Series([pd.Timedelta("1 days")]) >>> s.abs() 0 1 days dtype: timedelta64[ns] @@ -1697,11 +1703,13 @@ def abs(self) -> Self: Select rows with data closest to certain value using argsort (from `StackOverflow `__). - >>> df = pd.DataFrame({ - ... 'a': [4, 5, 6, 7], - ... 'b': [10, 20, 30, 40], - ... 'c': [100, 50, -30, -50] - ... }) + >>> df = pd.DataFrame( + ... { + ... "a": [4, 5, 6, 7], + ... "b": [10, 20, 30, 40], + ... "c": [100, 50, -30, -50], + ... } + ... ) >>> df a b c 0 4 10 100 @@ -2022,7 +2030,7 @@ def __iter__(self) -> Iterator: Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> for x in df: ... print(x) A @@ -2044,8 +2052,10 @@ def keys(self) -> Index: Examples -------- - >>> d = pd.DataFrame(data={'A': [1, 2, 3], 'B': [0, 4, 8]}, - ... index=['a', 'b', 'c']) + >>> d = pd.DataFrame( + ... data={"A": [1, 2, 3], "B": [0, 4, 8]}, + ... index=["a", "b", "c"], + ... ) >>> d A B a 1 0 @@ -2106,7 +2116,7 @@ def empty(self) -> bool_t: -------- An example of an actual empty DataFrame. Notice the index is empty: - >>> df_empty = pd.DataFrame({'A' : []}) + >>> df_empty = pd.DataFrame({"A": []}) >>> df_empty Empty DataFrame Columns: [A] @@ -2117,7 +2127,7 @@ def empty(self) -> bool_t: If we only have NaNs in our DataFrame, it is not considered empty! We will need to drop the NaNs to make the DataFrame empty: - >>> df = pd.DataFrame({'A' : [np.nan]}) + >>> df = pd.DataFrame({"A": [np.nan]}) >>> df A 0 NaN @@ -2126,7 +2136,7 @@ def empty(self) -> bool_t: >>> df.dropna().empty True - >>> ser_empty = pd.Series({'A' : []}) + >>> ser_empty = pd.Series({"A": []}) >>> ser_empty A [] dtype: object @@ -2363,35 +2373,43 @@ def to_excel( Create, write to and save a workbook: - >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], - ... index=['row 1', 'row 2'], - ... columns=['col 1', 'col 2']) + >>> df1 = pd.DataFrame( + ... [["a", "b"], ["c", "d"]], + ... index=["row 1", "row 2"], + ... columns=["col 1", "col 2"], + ... ) >>> df1.to_excel("output.xlsx") # doctest: +SKIP To specify the sheet name: - >>> df1.to_excel("output.xlsx", - ... sheet_name='Sheet_name_1') # doctest: +SKIP + >>> df1.to_excel( + ... "output.xlsx", sheet_name="Sheet_name_1" + ... ) # doctest: +SKIP If you wish to write to more than one sheet in the workbook, it is necessary to specify an ExcelWriter object: >>> df2 = df1.copy() - >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP - ... df1.to_excel(writer, sheet_name='Sheet_name_1') - ... df2.to_excel(writer, sheet_name='Sheet_name_2') + >>> with pd.ExcelWriter( + ... "output.xlsx" + ... ) as writer: # doctest: +SKIP + ... df1.to_excel(writer, sheet_name="Sheet_name_1") + ... df2.to_excel(writer, sheet_name="Sheet_name_2") ExcelWriter can also be used to append to an existing Excel file: - >>> with pd.ExcelWriter('output.xlsx', - ... mode='a') as writer: # doctest: +SKIP - ... df1.to_excel(writer, sheet_name='Sheet_name_3') + >>> with pd.ExcelWriter( + ... "output.xlsx", mode="a" + ... ) as writer: # doctest: +SKIP + ... df1.to_excel(writer, sheet_name="Sheet_name_3") To set the library that is used to write the Excel file, you can pass the `engine` keyword (the default engine is automatically chosen depending on the file extension): - >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP + >>> df1.to_excel( + ... "output1.xlsx", engine="xlsxwriter" + ... 
) # doctest: +SKIP """ if engine_kwargs is None: engine_kwargs = {} @@ -2822,23 +2840,24 @@ def to_hdf( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - ... index=['a', 'b', 'c']) # doctest: +SKIP - >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"] + ... ) # doctest: +SKIP + >>> df.to_hdf("data.h5", key="df", mode="w") # doctest: +SKIP We can add another object to the same file: >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP - >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP + >>> s.to_hdf("data.h5", key="s") # doctest: +SKIP Reading from HDF file: - >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP + >>> pd.read_hdf("data.h5", "df") # doctest: +SKIP A B a 1 4 b 2 5 c 3 6 - >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP + >>> pd.read_hdf("data.h5", "s") # doctest: +SKIP 0 1 1 2 2 3 @@ -3135,7 +3154,9 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP + >>> original_df = pd.DataFrame( + ... {{"foo": range(5), "bar": range(5, 10)}} + ... ) # doctest: +SKIP >>> original_df # doctest: +SKIP foo bar 0 0 5 @@ -3145,7 +3166,9 @@ def to_pickle( 4 4 9 >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP - >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP + >>> unpickled_df = pd.read_pickle( + ... "./dummy.pkl" + ... ) # doctest: +SKIP >>> unpickled_df # doctest: +SKIP foo bar 0 0 5 @@ -3153,7 +3176,7 @@ def to_pickle( 2 2 7 3 3 8 4 4 9 - """ # noqa: E501 + """ from pandas.io.pickle import to_pickle to_pickle( @@ -3211,9 +3234,11 @@ def to_clipboard( -------- Copy the contents of a DataFrame to the clipboard. - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) + >>> df = pd.DataFrame( + ... [[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"] + ... ) - >>> df.to_clipboard(sep=',') # doctest: +SKIP + >>> df.to_clipboard(sep=",") # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # ,A,B,C ... # 0,1,2,3 @@ -3222,7 +3247,7 @@ def to_clipboard( We can omit the index by passing the keyword `index` and setting it to false. - >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP + >>> df.to_clipboard(sep=",", index=False) # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # A,B,C ... # 1,2,3 @@ -3233,6 +3258,7 @@ def to_clipboard( .. code-block:: python import pyperclip + html = df.style.to_html() pyperclip.copy(html) """ @@ -3262,12 +3288,15 @@ def to_xarray(self): Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2), - ... ('parrot', 'bird', 24.0, 2), - ... ('lion', 'mammal', 80.5, 4), - ... ('monkey', 'mammal', np.nan, 4)], - ... columns=['name', 'class', 'max_speed', - ... 'num_legs']) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0, 2), + ... ("parrot", "bird", 24.0, 2), + ... ("lion", "mammal", 80.5, 4), + ... ("monkey", "mammal", np.nan, 4), + ... ], + ... columns=["name", "class", "max_speed", "num_legs"], + ... ) >>> df name class max_speed num_legs 0 falcon bird 389.0 2 @@ -3286,19 +3315,23 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' max_speed (index) float64 389.0 24.0 80.5 nan num_legs (index) int64 2 2 4 4 - >>> df['max_speed'].to_xarray() + >>> df["max_speed"].to_xarray() array([389. , 24. , 80.5, nan]) Coordinates: * index (index) int64 0 1 2 3 - >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01', - ... 
'2018-01-02', '2018-01-02'])
-        >>> df_multiindex = pd.DataFrame({'date': dates,
-        ...                               'animal': ['falcon', 'parrot',
-        ...                                          'falcon', 'parrot'],
-        ...                               'speed': [350, 18, 361, 15]})
-        >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
+        >>> dates = pd.to_datetime(
+        ...     ["2018-01-01", "2018-01-01", "2018-01-02", "2018-01-02"]
+        ... )
+        >>> df_multiindex = pd.DataFrame(
+        ...     {
+        ...         "date": dates,
+        ...         "animal": ["falcon", "parrot", "falcon", "parrot"],
+        ...         "speed": [350, 18, 361, 15],
+        ...     }
+        ... )
+        >>> df_multiindex = df_multiindex.set_index(["date", "animal"])
         >>> df_multiindex
                            speed
         date       animal
@@ -3921,31 +3954,45 @@ def to_csv(
         --------
         Create 'out.csv' containing 'df' without indices

-        >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
-        ...                    'mask': ['red', 'purple'],
-        ...                    'weapon': ['sai', 'bo staff']}})
-        >>> df.to_csv('out.csv', index=False)  # doctest: +SKIP
+        >>> df = pd.DataFrame(
+        ...     {{
+        ...         "name": ["Raphael", "Donatello"],
+        ...         "mask": ["red", "purple"],
+        ...         "weapon": ["sai", "bo staff"],
+        ...     }}
+        ... )
+        >>> df.to_csv("out.csv", index=False)  # doctest: +SKIP

         Create 'out.zip' containing 'out.csv'

         >>> df.to_csv(index=False)
         'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'

-        >>> compression_opts = dict(method='zip',
-        ...                         archive_name='out.csv')  # doctest: +SKIP
-        >>> df.to_csv('out.zip', index=False,
-        ...           compression=compression_opts)  # doctest: +SKIP
+        >>> compression_opts = dict(
+        ...     method="zip", archive_name="out.csv"
+        ... )  # doctest: +SKIP
+        >>> df.to_csv(
+        ...     "out.zip", index=False, compression=compression_opts
+        ... )  # doctest: +SKIP

         To write a csv file to a new folder or nested folder you will first
         need to create it using either Pathlib or os:

         >>> from pathlib import Path  # doctest: +SKIP
-        >>> filepath = Path('folder/subfolder/out.csv')  # doctest: +SKIP
-        >>> filepath.parent.mkdir(parents=True, exist_ok=True)  # doctest: +SKIP
+        >>> filepath = Path(
+        ...     "folder/subfolder/out.csv"
+        ... )  # doctest: +SKIP
+        >>> filepath.parent.mkdir(
+        ...     parents=True, exist_ok=True
+        ... )  # doctest: +SKIP
         >>> df.to_csv(filepath)  # doctest: +SKIP

         >>> import os  # doctest: +SKIP
-        >>> os.makedirs('folder/subfolder', exist_ok=True)  # doctest: +SKIP
-        >>> df.to_csv('folder/subfolder/out.csv')  # doctest: +SKIP
+        >>> os.makedirs(
+        ...     "folder/subfolder", exist_ok=True
+        ... )  # doctest: +SKIP
+        >>> df.to_csv("folder/subfolder/out.csv")  # doctest: +SKIP
         """
         df = self if isinstance(self, ABCDataFrame) else self.to_frame()

@@ -4052,12 +4099,16 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
         Examples
         --------
-        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
-        ...                    ('parrot', 'bird', 24.0),
-        ...                    ('lion', 'mammal', 80.5),
-        ...                    ('monkey', 'mammal', np.nan)],
-        ...                   columns=['name', 'class', 'max_speed'],
-        ...                   index=[0, 2, 3, 1])
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         ("falcon", "bird", 389.0),
+        ...         ("parrot", "bird", 24.0),
+        ...         ("lion", "mammal", 80.5),
+        ...         ("monkey", "mammal", np.nan),
+        ...     ],
+        ...     columns=["name", "class", "max_speed"],
+        ...     index=[0, 2, 3, 1],
+        ... )
         >>> df
              name   class  max_speed
         0  falcon    bird      389.0
         2  parrot    bird       24.0
         3    lion  mammal       80.5
         1  monkey  mammal        NaN
@@ -4200,13 +4251,15 @@ def xs(
         Examples
         --------
-        >>> d = {'num_legs': [4, 4, 2, 2],
-        ...      'num_wings': [0, 0, 2, 2],
-        ...      'class': ['mammal', 'mammal', 'mammal', 'bird'],
-        ...      'animal': ['cat', 'dog', 'bat', 'penguin'],
-        ...      'locomotion': ['walks', 'walks', 'flies', 'walks']}
+        >>> d = {
+        ...     "num_legs": [4, 4, 2, 2],
+        ...     "num_wings": [0, 0, 2, 2],
+        ...     
"class": ["mammal", "mammal", "mammal", "bird"], + ... "animal": ["cat", "dog", "bat", "penguin"], + ... "locomotion": ["walks", "walks", "flies", "walks"], + ... } >>> df = pd.DataFrame(data=d) - >>> df = df.set_index(['class', 'animal', 'locomotion']) + >>> df = df.set_index(["class", "animal", "locomotion"]) >>> df num_legs num_wings class animal locomotion @@ -4217,7 +4270,7 @@ class animal locomotion Get values at specified index - >>> df.xs('mammal') + >>> df.xs("mammal") num_legs num_wings animal locomotion cat walks 4 0 @@ -4226,29 +4279,28 @@ class animal locomotion Get values at several indexes - >>> df.xs(('mammal', 'dog', 'walks')) + >>> df.xs(("mammal", "dog", "walks")) num_legs 4 num_wings 0 Name: (mammal, dog, walks), dtype: int64 Get values at specified index and level - >>> df.xs('cat', level=1) + >>> df.xs("cat", level=1) num_legs num_wings class locomotion mammal walks 4 0 Get values at several indexes and levels - >>> df.xs(('bird', 'walks'), - ... level=[0, 'locomotion']) + >>> df.xs(("bird", "walks"), level=[0, "locomotion"]) num_legs num_wings animal penguin 2 2 Get values at specified column and axis - >>> df.xs('num_wings', axis=1) + >>> df.xs("num_wings", axis=1) class animal locomotion mammal cat walks 0 dog walks 0 @@ -4541,8 +4593,14 @@ def get(self, key, default=None): ... [22, 71.6, "medium"], ... [35, 95, "medium"], ... ], - ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"], - ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"), + ... columns=[ + ... "temp_celsius", + ... "temp_fahrenheit", + ... "windspeed", + ... ], + ... index=pd.date_range( + ... start="2014-02-12", end="2014-02-15", freq="D" + ... ), ... ) >>> df @@ -4559,16 +4617,18 @@ def get(self, key, default=None): 2014-02-14 22.0 medium 2014-02-15 35.0 medium - >>> ser = df['windspeed'] - >>> ser.get('2014-02-13') + >>> ser = df["windspeed"] + >>> ser.get("2014-02-13") 'high' If the key isn't found, the default value will be used. - >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value") + >>> df.get( + ... ["temp_celsius", "temp_kelvin"], default="default_value" + ... ) 'default_value' - >>> ser.get('2014-02-10', '[unknown]') + >>> ser.get("2014-02-10", "[unknown]") '[unknown]' """ try: @@ -4660,14 +4720,22 @@ def reindex_like( Examples -------- - >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'], - ... [31, 87.8, 'high'], - ... [22, 71.6, 'medium'], - ... [35, 95, 'medium']], - ... columns=['temp_celsius', 'temp_fahrenheit', - ... 'windspeed'], - ... index=pd.date_range(start='2014-02-12', - ... end='2014-02-15', freq='D')) + >>> df1 = pd.DataFrame( + ... [ + ... [24.3, 75.7, "high"], + ... [31, 87.8, "high"], + ... [22, 71.6, "medium"], + ... [35, 95, "medium"], + ... ], + ... columns=[ + ... "temp_celsius", + ... "temp_fahrenheit", + ... "windspeed", + ... ], + ... index=pd.date_range( + ... start="2014-02-12", end="2014-02-15", freq="D" + ... ), + ... ) >>> df1 temp_celsius temp_fahrenheit windspeed @@ -4676,12 +4744,13 @@ def reindex_like( 2014-02-14 22.0 71.6 medium 2014-02-15 35.0 95.0 medium - >>> df2 = pd.DataFrame([[28, 'low'], - ... [30, 'low'], - ... [35.1, 'medium']], - ... columns=['temp_celsius', 'windspeed'], - ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', - ... '2014-02-15'])) + >>> df2 = pd.DataFrame( + ... [[28, "low"], [30, "low"], [35.1, "medium"]], + ... columns=["temp_celsius", "windspeed"], + ... index=pd.DatetimeIndex( + ... ["2014-02-12", "2014-02-13", "2014-02-15"] + ... ), + ... 
) >>> df2 temp_celsius windspeed @@ -4929,14 +4998,14 @@ def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self: 3 4 dtype: int64 - >>> s.add_prefix('item_') + >>> s.add_prefix("item_") item_0 1 item_1 2 item_2 3 item_3 4 dtype: int64 - >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) >>> df A B 0 1 3 @@ -4944,7 +5013,7 @@ def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self: 2 3 5 3 4 6 - >>> df.add_prefix('col_') + >>> df.add_prefix("col_") col_A col_B 0 1 3 1 2 4 @@ -5003,14 +5072,14 @@ def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self: 3 4 dtype: int64 - >>> s.add_suffix('_item') + >>> s.add_suffix("_item") 0_item 1 1_item 2 2_item 3 3_item 4 dtype: int64 - >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) >>> df A B 0 1 3 @@ -5018,7 +5087,7 @@ def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self: 2 3 5 3 4 6 - >>> df.add_suffix('_col') + >>> df.add_suffix("_col") A_col B_col 0 1 3 1 2 4 @@ -5135,12 +5204,14 @@ def sort_values( Examples -------- - >>> df = pd.DataFrame({ - ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], - ... 'col2': [2, 1, 9, 8, 7, 4], - ... 'col3': [0, 1, 9, 4, 2, 3], - ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] - ... }) + >>> df = pd.DataFrame( + ... { + ... "col1": ["A", "A", "B", np.nan, "D", "C"], + ... "col2": [2, 1, 9, 8, 7, 4], + ... "col3": [0, 1, 9, 4, 2, 3], + ... "col4": ["a", "B", "c", "D", "e", "F"], + ... } + ... ) >>> df col1 col2 col3 col4 0 A 2 0 a @@ -5152,7 +5223,7 @@ def sort_values( Sort by col1 - >>> df.sort_values(by=['col1']) + >>> df.sort_values(by=["col1"]) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B @@ -5163,7 +5234,7 @@ def sort_values( Sort by multiple columns - >>> df.sort_values(by=['col1', 'col2']) + >>> df.sort_values(by=["col1", "col2"]) col1 col2 col3 col4 1 A 1 1 B 0 A 2 0 a @@ -5174,7 +5245,7 @@ def sort_values( Sort Descending - >>> df.sort_values(by='col1', ascending=False) + >>> df.sort_values(by="col1", ascending=False) col1 col2 col3 col4 4 D 7 2 e 5 C 4 3 F @@ -5185,7 +5256,9 @@ def sort_values( Putting NAs first - >>> df.sort_values(by='col1', ascending=False, na_position='first') + >>> df.sort_values( + ... by="col1", ascending=False, na_position="first" + ... ) col1 col2 col3 col4 3 NaN 8 4 D 4 D 7 2 e @@ -5196,7 +5269,7 @@ def sort_values( Sorting with a key function - >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) + >>> df.sort_values(by="col4", key=lambda col: col.str.lower()) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B @@ -5208,10 +5281,12 @@ def sort_values( Natural sort with the key argument, using the `natsort ` package. - >>> df = pd.DataFrame({ - ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], - ... "value": [10, 20, 30, 40, 50] - ... }) + >>> df = pd.DataFrame( + ... { + ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], + ... "value": [10, 20, 30, 40, 50], + ... } + ... ) >>> df time value 0 0hr 10 @@ -5222,7 +5297,7 @@ def sort_values( >>> from natsort import index_natsorted >>> df.sort_values( ... by="time", - ... key=lambda x: np.argsort(index_natsorted(df["time"])) + ... key=lambda x: np.argsort(index_natsorted(df["time"])), ... ) time value 0 0hr 10 @@ -5428,10 +5503,16 @@ def reindex( Create a dataframe with some fictional data. - >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] - >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301], - ... 
'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}}, - ... index=index) + >>> index = ["Firefox", "Chrome", "Safari", "IE10", "Konqueror"] + >>> df = pd.DataFrame( + ... { + ... { + ... "http_status": [200, 200, 404, 404, 301], + ... "response_time": [0.04, 0.02, 0.07, 0.08, 1.0], + ... } + ... }, + ... index=index, + ... ) >>> df http_status response_time Firefox 200 0.04 @@ -5444,8 +5525,13 @@ def reindex( values in the new index that do not have corresponding records in the dataframe are assigned ``NaN``. - >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', - ... 'Chrome'] + >>> new_index = [ + ... "Safari", + ... "Iceweasel", + ... "Comodo Dragon", + ... "IE10", + ... "Chrome", + ... ] >>> df.reindex(new_index) http_status response_time Safari 404.0 0.07 @@ -5467,7 +5553,7 @@ def reindex( IE10 404 0.08 Chrome 200 0.02 - >>> df.reindex(new_index, fill_value='missing') + >>> df.reindex(new_index, fill_value="missing") http_status response_time Safari 404 0.07 Iceweasel missing missing @@ -5477,7 +5563,7 @@ def reindex( We can also reindex the columns. - >>> df.reindex(columns=['http_status', 'user_agent']) + >>> df.reindex(columns=["http_status", "user_agent"]) http_status user_agent Firefox 200 NaN Chrome 200 NaN @@ -5487,7 +5573,7 @@ def reindex( Or we can use "axis-style" keyword arguments - >>> df.reindex(['http_status', 'user_agent'], axis="columns") + >>> df.reindex(["http_status", "user_agent"], axis="columns") http_status user_agent Firefox 200 NaN Chrome 200 NaN @@ -5500,9 +5586,11 @@ def reindex( monotonically increasing index (for example, a sequence of dates). - >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') - >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}}, - ... index=date_index) + >>> date_index = pd.date_range("1/1/2010", periods=6, freq="D") + >>> df2 = pd.DataFrame( + ... {{"prices": [100, 101, np.nan, 100, 89, 88]}}, + ... index=date_index, + ... ) >>> df2 prices 2010-01-01 100.0 @@ -5515,7 +5603,9 @@ def reindex( Suppose we decide to expand the dataframe to cover a wider date range. - >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') + >>> date_index2 = pd.date_range( + ... "12/29/2009", periods=10, freq="D" + ... ) >>> df2.reindex(date_index2) prices 2009-12-29 NaN @@ -5537,7 +5627,7 @@ def reindex( For example, to back-propagate the last valid value to fill the ``NaN`` values, pass ``bfill`` as an argument to the ``method`` keyword. - >>> df2.reindex(date_index2, method='bfill') + >>> df2.reindex(date_index2, method="bfill") prices 2009-12-29 100.0 2009-12-30 100.0 @@ -5746,28 +5836,30 @@ def filter( Examples -------- - >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), - ... index=['mouse', 'rabbit'], - ... columns=['one', 'two', 'three']) + >>> df = pd.DataFrame( + ... np.array(([1, 2, 3], [4, 5, 6])), + ... index=["mouse", "rabbit"], + ... columns=["one", "two", "three"], + ... 
) >>> df one two three mouse 1 2 3 rabbit 4 5 6 >>> # select columns by name - >>> df.filter(items=['one', 'three']) + >>> df.filter(items=["one", "three"]) one three mouse 1 3 rabbit 4 6 >>> # select columns by regular expression - >>> df.filter(regex='e$', axis=1) + >>> df.filter(regex="e$", axis=1) one three mouse 1 3 rabbit 4 6 >>> # select rows containing 'bbi' - >>> df.filter(like='bbi', axis=0) + >>> df.filter(like="bbi", axis=0) one two three rabbit 4 5 6 """ @@ -5839,8 +5931,21 @@ def head(self, n: int = 5) -> Self: Examples -------- - >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', - ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df = pd.DataFrame( + ... { + ... "animal": [ + ... "alligator", + ... "bee", + ... "falcon", + ... "lion", + ... "monkey", + ... "parrot", + ... "shark", + ... "whale", + ... "zebra", + ... ] + ... } + ... ) >>> df animal 0 alligator @@ -5916,8 +6021,21 @@ def tail(self, n: int = 5) -> Self: Examples -------- - >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', - ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df = pd.DataFrame( + ... { + ... "animal": [ + ... "alligator", + ... "bee", + ... "falcon", + ... "lion", + ... "monkey", + ... "parrot", + ... "shark", + ... "whale", + ... "zebra", + ... ] + ... } + ... ) >>> df animal 0 alligator @@ -6042,10 +6160,14 @@ def sample( Examples -------- - >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0], - ... 'num_wings': [2, 0, 0, 0], - ... 'num_specimen_seen': [10, 2, 1, 8]}, - ... index=['falcon', 'dog', 'spider', 'fish']) + >>> df = pd.DataFrame( + ... { + ... "num_legs": [2, 4, 8, 0], + ... "num_wings": [2, 0, 0, 0], + ... "num_specimen_seen": [10, 2, 1, 8], + ... }, + ... index=["falcon", "dog", "spider", "fish"], + ... ) >>> df num_legs num_wings num_specimen_seen falcon 2 2 10 @@ -6057,7 +6179,7 @@ def sample( Note that we use `random_state` to ensure the reproducibility of the examples. - >>> df['num_legs'].sample(n=3, random_state=1) + >>> df["num_legs"].sample(n=3, random_state=1) fish 0 spider 8 falcon 2 @@ -6087,7 +6209,7 @@ def sample( Using a DataFrame column as weights. Rows with larger value in the `num_specimen_seen` column are more likely to be sampled. - >>> df.sample(n=2, weights='num_specimen_seen', random_state=1) + >>> df.sample(n=2, weights="num_specimen_seen", random_state=1) num_legs num_wings num_specimen_seen falcon 2 2 10 fish 0 0 8 @@ -6162,7 +6284,7 @@ def pipe( Constructing a income DataFrame from a dictionary. >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] - >>> df = pd.DataFrame(data, columns=['Salary', 'Others']) + >>> df = pd.DataFrame(data, columns=["Salary", "Others"]) >>> df Salary Others 0 8000 1000.0 @@ -6184,14 +6306,19 @@ def pipe( >>> subtract_national_insurance( ... subtract_state_tax(subtract_federal_tax(df), rate=0.12), ... rate=0.05, - ... rate_increase=0.02) # doctest: +SKIP + ... rate_increase=0.02, + ... ) # doctest: +SKIP You can write >>> ( ... df.pipe(subtract_federal_tax) ... .pipe(subtract_state_tax, rate=0.12) - ... .pipe(subtract_national_insurance, rate=0.05, rate_increase=0.02) + ... .pipe( + ... subtract_national_insurance, + ... rate=0.05, + ... rate_increase=0.02, + ... ) ... ) Salary Others 0 5892.48 736.56 @@ -6210,9 +6337,9 @@ def pipe( ... df.pipe(subtract_federal_tax) ... .pipe(subtract_state_tax, rate=0.12) ... .pipe( - ... (subtract_national_insurance, 'df'), + ... (subtract_national_insurance, "df"), ... rate=0.05, - ... rate_increase=0.02 + ... rate_increase=0.02, ... 
) ... ) Salary Others @@ -6440,10 +6567,14 @@ def dtypes(self): Examples -------- - >>> df = pd.DataFrame({'float': [1.0], - ... 'int': [1], - ... 'datetime': [pd.Timestamp('20180310')], - ... 'string': ['foo']}) + >>> df = pd.DataFrame( + ... { + ... "float": [1.0], + ... "int": [1], + ... "datetime": [pd.Timestamp("20180310")], + ... "string": ["foo"], + ... } + ... ) >>> df.dtypes float float64 int int64 @@ -6514,7 +6645,7 @@ def astype( -------- Create a DataFrame: - >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> d = {"col1": [1, 2], "col2": [3, 4]} >>> df = pd.DataFrame(data=d) >>> df.dtypes col1 int64 @@ -6523,33 +6654,33 @@ def astype( Cast all columns to int32: - >>> df.astype('int32').dtypes + >>> df.astype("int32").dtypes col1 int32 col2 int32 dtype: object Cast col1 to int32 using a dictionary: - >>> df.astype({'col1': 'int32'}).dtypes + >>> df.astype({"col1": "int32"}).dtypes col1 int32 col2 int64 dtype: object Create a series: - >>> ser = pd.Series([1, 2], dtype='int32') + >>> ser = pd.Series([1, 2], dtype="int32") >>> ser 0 1 1 2 dtype: int32 - >>> ser.astype('int64') + >>> ser.astype("int64") 0 1 1 2 dtype: int64 Convert to categorical type: - >>> ser.astype('category') + >>> ser.astype("category") 0 1 1 2 dtype: category @@ -6559,7 +6690,8 @@ def astype( >>> from pandas.api.types import CategoricalDtype >>> cat_dtype = CategoricalDtype( - ... categories=[2, 1], ordered=True) + ... categories=[2, 1], ordered=True + ... ) >>> ser.astype(cat_dtype) 0 1 1 2 @@ -6568,7 +6700,7 @@ def astype( Create a series of dates: - >>> ser_date = pd.Series(pd.date_range('20200101', periods=3)) + >>> ser_date = pd.Series(pd.date_range("20200101", periods=3)) >>> ser_date 0 2020-01-01 1 2020-01-02 @@ -6961,11 +7093,21 @@ def convert_dtypes( >>> df = pd.DataFrame( ... { ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), - ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), - ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), - ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), - ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")), - ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), + ... "b": pd.Series( + ... ["x", "y", "z"], dtype=np.dtype("O") + ... ), + ... "c": pd.Series( + ... [True, False, np.nan], dtype=np.dtype("O") + ... ), + ... "d": pd.Series( + ... ["h", "i", np.nan], dtype=np.dtype("O") + ... ), + ... "e": pd.Series( + ... [10, np.nan, 20], dtype=np.dtype("float") + ... ), + ... "f": pd.Series( + ... [np.nan, 100.5, 200], dtype=np.dtype("float") + ... ), ... } ... ) @@ -7198,11 +7340,15 @@ def fillna( Examples -------- - >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], - ... [3, 4, np.nan, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [np.nan, 3, np.nan, 4]], - ... columns=list("ABCD")) + >>> df = pd.DataFrame( + ... [ + ... [np.nan, 2, np.nan, 0], + ... [3, 4, np.nan, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [np.nan, 3, np.nan, 4], + ... ], + ... columns=list("ABCD"), + ... ) >>> df A B C D 0 NaN 2.0 NaN 0.0 @@ -7510,11 +7656,15 @@ def ffill( Examples -------- - >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], - ... [3, 4, np.nan, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [np.nan, 3, np.nan, 4]], - ... columns=list("ABCD")) + >>> df = pd.DataFrame( + ... [ + ... [np.nan, 2, np.nan, 0], + ... [3, 4, np.nan, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [np.nan, 3, np.nan, 4], + ... ], + ... columns=list("ABCD"), + ... 
) >>> df A B C D 0 NaN 2.0 NaN 0.0 @@ -7704,7 +7854,9 @@ def bfill( With DataFrame: - >>> df = pd.DataFrame({{'A': [1, None, None, 4], 'B': [None, 5, None, 7]}}) + >>> df = pd.DataFrame( + ... {{"A": [1, None, None, 4], "B": [None, 5, None, 7]}} + ... ) >>> df A B 0 1.0 NaN @@ -8283,7 +8435,7 @@ def interpolate( an ``order`` (int). >>> s = pd.Series([0, 2, np.nan, 8]) - >>> s.interpolate(method='polynomial', order=2) + >>> s.interpolate(method="polynomial", order=2) 0 0.000000 1 2.000000 2 4.666667 @@ -8298,18 +8450,24 @@ def interpolate( Note how the first entry in column 'b' remains ``NaN``, because there is no entry before it to use for interpolation. - >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0), - ... (np.nan, 2.0, np.nan, np.nan), - ... (2.0, 3.0, np.nan, 9.0), - ... (np.nan, 4.0, -4.0, 16.0)], - ... columns=list('abcd')) + >>> df = pd.DataFrame( + ... [ + ... (0.0, np.nan, -1.0, 1.0), + ... (np.nan, 2.0, np.nan, np.nan), + ... (2.0, 3.0, np.nan, 9.0), + ... (np.nan, 4.0, -4.0, 16.0), + ... ], + ... columns=list("abcd"), + ... ) >>> df a b c d 0 0.0 NaN -1.0 1.0 1 NaN 2.0 NaN NaN 2 2.0 3.0 NaN 9.0 3 NaN 4.0 -4.0 16.0 - >>> df.interpolate(method='linear', limit_direction='forward', axis=0) + >>> df.interpolate( + ... method="linear", limit_direction="forward", axis=0 + ... ) a b c d 0 0.0 NaN -1.0 1.0 1 1.0 2.0 -2.0 5.0 @@ -8318,7 +8476,7 @@ def interpolate( Using polynomial interpolation. - >>> df['d'].interpolate(method='polynomial', order=2) + >>> df["d"].interpolate(method="polynomial", order=2) 0 1.0 1 4.0 2 9.0 @@ -8537,24 +8695,38 @@ def asof(self, where, subset=None): Take all columns into consideration - >>> df = pd.DataFrame({'a': [10., 20., 30., 40., 50.], - ... 'b': [None, None, None, None, 500]}, - ... index=pd.DatetimeIndex(['2018-02-27 09:01:00', - ... '2018-02-27 09:02:00', - ... '2018-02-27 09:03:00', - ... '2018-02-27 09:04:00', - ... '2018-02-27 09:05:00'])) - >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', - ... '2018-02-27 09:04:30'])) + >>> df = pd.DataFrame( + ... { + ... "a": [10.0, 20.0, 30.0, 40.0, 50.0], + ... "b": [None, None, None, None, 500], + ... }, + ... index=pd.DatetimeIndex( + ... [ + ... "2018-02-27 09:01:00", + ... "2018-02-27 09:02:00", + ... "2018-02-27 09:03:00", + ... "2018-02-27 09:04:00", + ... "2018-02-27 09:05:00", + ... ] + ... ), + ... ) + >>> df.asof( + ... pd.DatetimeIndex( + ... ["2018-02-27 09:03:30", "2018-02-27 09:04:30"] + ... ) + ... ) a b 2018-02-27 09:03:30 NaN NaN 2018-02-27 09:04:30 NaN NaN Take a single column into consideration - >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', - ... '2018-02-27 09:04:30']), - ... subset=['a']) + >>> df.asof( + ... pd.DatetimeIndex( + ... ["2018-02-27 09:03:30", "2018-02-27 09:04:30"] + ... ), + ... subset=["a"], + ... ) a b 2018-02-27 09:03:30 30.0 NaN 2018-02-27 09:04:30 40.0 NaN @@ -8665,11 +8837,18 @@ def isna(self) -> Self: -------- Show which entries in a DataFrame are NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], - ... born=[pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... name=['Alfred', 'Batman', ''], - ... toy=[None, 'Batmobile', 'Joker'])) + >>> df = pd.DataFrame( + ... dict( + ... age=[5, 6, np.nan], + ... born=[ + ... pd.NaT, + ... pd.Timestamp("1939-05-27"), + ... pd.Timestamp("1940-04-25"), + ... ], + ... name=["Alfred", "Batman", ""], + ... toy=[None, "Batmobile", "Joker"], + ... ) + ... 
) >>> df age born name toy 0 5.0 NaT Alfred None @@ -8732,11 +8911,18 @@ def notna(self) -> Self: -------- Show which entries in a DataFrame are not NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], - ... born=[pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... name=['Alfred', 'Batman', ''], - ... toy=[None, 'Batmobile', 'Joker'])) + >>> df = pd.DataFrame( + ... dict( + ... age=[5, 6, np.nan], + ... born=[ + ... pd.NaT, + ... pd.Timestamp("1939-05-27"), + ... pd.Timestamp("1940-04-25"), + ... ], + ... name=["Alfred", "Batman", ""], + ... toy=[None, "Batmobile", "Joker"], + ... ) + ... ) >>> df age born name toy 0 5.0 NaT Alfred None @@ -8782,15 +8968,11 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): if lower is not None: cond = mask | (self >= lower) - result = result.where( - cond, lower, inplace=inplace - ) # type: ignore[assignment] + result = result.where(cond, lower, inplace=inplace) # type: ignore[assignment] if upper is not None: cond = mask | (self <= upper) result = self if inplace else result - result = result.where( - cond, upper, inplace=inplace - ) # type: ignore[assignment] + result = result.where(cond, upper, inplace=inplace) # type: ignore[assignment] return result @@ -8913,7 +9095,10 @@ def clip( Examples -------- - >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]} + >>> data = { + ... "col_0": [9, -3, 0, -1, 5], + ... "col_1": [-2, -7, 6, 8, -5], + ... } >>> df = pd.DataFrame(data) >>> df col_0 col_1 @@ -9126,9 +9311,9 @@ def asfreq( -------- Start by creating a series with 4 one minute timestamps. - >>> index = pd.date_range('1/1/2000', periods=4, freq='min') + >>> index = pd.date_range("1/1/2000", periods=4, freq="min") >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index) - >>> df = pd.DataFrame({{'s': series}}) + >>> df = pd.DataFrame({{"s": series}}) >>> df s 2000-01-01 00:00:00 0.0 @@ -9138,7 +9323,7 @@ def asfreq( Upsample the series into 30 second bins. - >>> df.asfreq(freq='30s') + >>> df.asfreq(freq="30s") s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN @@ -9150,7 +9335,7 @@ def asfreq( Upsample again, providing a ``fill value``. - >>> df.asfreq(freq='30s', fill_value=9.0) + >>> df.asfreq(freq="30s", fill_value=9.0) s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 9.0 @@ -9162,7 +9347,7 @@ def asfreq( Upsample again, providing a ``method``. 
- >>> df.asfreq(freq='30s', method='bfill') + >>> df.asfreq(freq="30s", method="bfill") s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN @@ -9214,8 +9399,8 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='12h') - >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> i = pd.date_range("2018-04-09", periods=4, freq="12h") + >>> ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 00:00:00 1 @@ -9223,7 +9408,7 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: 2018-04-10 00:00:00 3 2018-04-10 12:00:00 4 - >>> ts.at_time('12:00') + >>> ts.at_time("12:00") A 2018-04-09 12:00:00 2 2018-04-10 12:00:00 4 @@ -9286,8 +9471,8 @@ def between_time( Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min') - >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> i = pd.date_range("2018-04-09", periods=4, freq="1D20min") + >>> ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 00:00:00 1 @@ -9295,7 +9480,7 @@ def between_time( 2018-04-11 00:40:00 3 2018-04-12 01:00:00 4 - >>> ts.between_time('0:15', '0:45') + >>> ts.between_time("0:15", "0:45") A 2018-04-10 00:20:00 2 2018-04-11 00:40:00 3 @@ -9303,7 +9488,7 @@ def between_time( You get the times that are *not* between two times by setting ``start_time`` later than ``end_time``: - >>> ts.between_time('0:45', '0:15') + >>> ts.between_time("0:45", "0:15") A 2018-04-09 00:00:00 1 2018-04-12 01:00:00 4 @@ -9448,7 +9633,7 @@ def resample( -------- Start by creating a series with 9 one minute timestamps. - >>> index = pd.date_range('1/1/2000', periods=9, freq='min') + >>> index = pd.date_range("1/1/2000", periods=9, freq="min") >>> series = pd.Series(range(9), index=index) >>> series 2000-01-01 00:00:00 0 @@ -9465,7 +9650,7 @@ def resample( Downsample the series into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> series.resample('3min').sum() + >>> series.resample("3min").sum() 2000-01-01 00:00:00 3 2000-01-01 00:03:00 12 2000-01-01 00:06:00 21 @@ -9479,7 +9664,7 @@ def resample( value in the resampled bucket with the label ``2000-01-01 00:03:00`` does not include 3 (if it did, the summed value would be 6, not 3). - >>> series.resample('3min', label='right').sum() + >>> series.resample("3min", label="right").sum() 2000-01-01 00:03:00 3 2000-01-01 00:06:00 12 2000-01-01 00:09:00 21 @@ -9488,7 +9673,7 @@ def resample( To include this value close the right side of the bin interval, as shown below. - >>> series.resample('3min', label='right', closed='right').sum() + >>> series.resample("3min", label="right", closed="right").sum() 2000-01-01 00:00:00 0 2000-01-01 00:03:00 6 2000-01-01 00:06:00 15 @@ -9497,7 +9682,7 @@ def resample( Upsample the series into 30 second bins. - >>> series.resample('30s').asfreq()[0:5] # Select first 5 rows + >>> series.resample("30s").asfreq()[0:5] # Select first 5 rows 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN 2000-01-01 00:01:00 1.0 @@ -9508,7 +9693,7 @@ def resample( Upsample the series into 30 second bins and fill the ``NaN`` values using the ``ffill`` method. - >>> series.resample('30s').ffill()[0:5] + >>> series.resample("30s").ffill()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 0 2000-01-01 00:01:00 1 @@ -9519,7 +9704,7 @@ def resample( Upsample the series into 30 second bins and fill the ``NaN`` values using the ``bfill`` method. 
-        >>> series.resample('30s').bfill()[0:5]
+        >>> series.resample("30s").bfill()[0:5]
         2000-01-01 00:00:00    0
         2000-01-01 00:00:30    1
         2000-01-01 00:01:00    1
@@ -9531,8 +9716,7 @@ def resample(

         >>> def custom_resampler(arraylike):
         ...     return np.sum(arraylike) + 5
-        ...
-        >>> series.resample('3min').apply(custom_resampler)
+        >>> series.resample("3min").apply(custom_resampler)
         2000-01-01 00:00:00     8
         2000-01-01 00:03:00    17
         2000-01-01 00:06:00    26
@@ -9541,12 +9725,16 @@ def resample(
         For DataFrame objects, the keyword `on` can be used to specify the
         column instead of the index for resampling.

-        >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
-        ...       'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
+        >>> d = {{
+        ...     "price": [10, 11, 9, 13, 14, 18, 17, 19],
+        ...     "volume": [50, 60, 40, 100, 50, 100, 40, 50],
+        ... }}
         >>> df = pd.DataFrame(d)
-        >>> df['week_starting'] = pd.date_range('01/01/2018',
-        ...                                     periods=8,
-        ...                                     freq='W')
+        >>> df["week_starting"] = pd.date_range(
+        ...     "01/01/2018", periods=8, freq="W"
+        ... )
         >>> df
            price  volume week_starting
         0     10      50    2018-01-07
         1     11      60    2018-01-14
         2      9      40    2018-01-21
         3     13     100    2018-01-28
         4     14      50    2018-02-04
         5     18     100    2018-02-11
         6     17      40    2018-02-18
         7     19      50    2018-02-25
-        >>> df.resample('ME', on='week_starting').mean()
+        >>> df.resample("ME", on="week_starting").mean()
                        price  volume
         week_starting
         2018-01-31     10.75    62.5
@@ -9566,14 +9754,18 @@ def resample(
         For a DataFrame with MultiIndex, the keyword `level` can be used to
         specify on which level the resampling needs to take place.

-        >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
-        >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
-        ...        'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
+        >>> days = pd.date_range("1/1/2000", periods=4, freq="D")
+        >>> d2 = {{
+        ...     "price": [10, 11, 9, 13, 14, 18, 17, 19],
+        ...     "volume": [50, 60, 40, 100, 50, 100, 40, 50],
+        ... }}
         >>> df2 = pd.DataFrame(
         ...     d2,
         ...     index=pd.MultiIndex.from_product(
-        ...         [days, ['morning', 'afternoon']]
-        ...     )
+        ...         [days, ["morning", "afternoon"]]
+        ...     ),
+        ...
) >>> df2 price volume @@ -9585,7 +9777,7 @@ def resample( afternoon 18 100 2000-01-04 morning 17 40 afternoon 19 50 - >>> df2.resample('D', level=0).sum() + >>> df2.resample("D", level=0).sum() price volume 2000-01-01 21 110 2000-01-02 22 140 @@ -9594,8 +9786,8 @@ def resample( If you want to adjust the start of the bins based on a fixed timestamp: - >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' - >>> rng = pd.date_range(start, end, freq='7min') + >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + >>> rng = pd.date_range(start, end, freq="7min") >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) >>> ts 2000-10-01 23:30:00 0 @@ -9609,7 +9801,7 @@ def resample( 2000-10-02 00:26:00 24 Freq: 7min, dtype: int64 - >>> ts.resample('17min').sum() + >>> ts.resample("17min").sum() 2000-10-01 23:14:00 0 2000-10-01 23:31:00 9 2000-10-01 23:48:00 21 @@ -9617,7 +9809,7 @@ def resample( 2000-10-02 00:22:00 24 Freq: 17min, dtype: int64 - >>> ts.resample('17min', origin='epoch').sum() + >>> ts.resample("17min", origin="epoch").sum() 2000-10-01 23:18:00 0 2000-10-01 23:35:00 18 2000-10-01 23:52:00 27 @@ -9625,7 +9817,7 @@ def resample( 2000-10-02 00:26:00 24 Freq: 17min, dtype: int64 - >>> ts.resample('17min', origin='2000-01-01').sum() + >>> ts.resample("17min", origin="2000-01-01").sum() 2000-10-01 23:24:00 3 2000-10-01 23:41:00 15 2000-10-01 23:58:00 45 @@ -9635,14 +9827,14 @@ def resample( If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: - >>> ts.resample('17min', origin='start').sum() + >>> ts.resample("17min", origin="start").sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 Freq: 17min, dtype: int64 - >>> ts.resample('17min', offset='23h30min').sum() + >>> ts.resample("17min", offset="23h30min").sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 @@ -9651,7 +9843,7 @@ def resample( If you want to take the largest Timestamp as the end of the bins: - >>> ts.resample('17min', origin='end').sum() + >>> ts.resample("17min", origin="end").sum() 2000-10-01 23:35:00 0 2000-10-01 23:52:00 18 2000-10-02 00:09:00 27 @@ -9662,7 +9854,7 @@ def resample( midnight of the largest Timestamp as the end of the bins and drop the bins not containing data: - >>> ts.resample('17min', origin='end_day').sum() + >>> ts.resample("17min", origin="end_day").sum() 2000-10-01 23:38:00 3 2000-10-01 23:55:00 15 2000-10-02 00:12:00 45 @@ -9765,8 +9957,8 @@ def first(self, offset) -> Self: Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> i = pd.date_range("2018-04-09", periods=4, freq="2D") + >>> ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 1 @@ -9776,7 +9968,7 @@ def first(self, offset) -> Self: Get the rows for the first 3 days: - >>> ts.first('3D') + >>> ts.first("3D") A 2018-04-09 1 2018-04-11 2 @@ -9853,8 +10045,8 @@ def last(self, offset) -> Self: Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> i = pd.date_range("2018-04-09", periods=4, freq="2D") + >>> ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 1 @@ -9864,7 +10056,7 @@ def last(self, offset) -> Self: Get the rows for the last 3 days: - >>> ts.last('3D') # doctest: +SKIP + >>> ts.last("3D") # doctest: +SKIP A 2018-04-13 3 2018-04-15 4 @@ -9953,9 +10145,18 @@ def rank( Examples -------- - >>> 
df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog', - ... 'spider', 'snake'], - ... 'Number_legs': [4, 2, 4, 8, np.nan]}) + >>> df = pd.DataFrame( + ... data={ + ... "Animal": [ + ... "cat", + ... "penguin", + ... "dog", + ... "spider", + ... "snake", + ... ], + ... "Number_legs": [4, 2, 4, 8, np.nan], + ... } + ... ) >>> df Animal Number_legs 0 cat 4.0 @@ -9989,10 +10190,10 @@ def rank( * pct_rank: when setting ``pct = True``, the ranking is expressed as percentile rank. - >>> df['default_rank'] = df['Number_legs'].rank() - >>> df['max_rank'] = df['Number_legs'].rank(method='max') - >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom') - >>> df['pct_rank'] = df['Number_legs'].rank(pct=True) + >>> df["default_rank"] = df["Number_legs"].rank() + >>> df["max_rank"] = df["Number_legs"].rank(method="max") + >>> df["NA_bottom"] = df["Number_legs"].rank(na_option="bottom") + >>> df["pct_rank"] = df["Number_legs"].rank(pct=True) >>> df Animal Number_legs default_rank max_rank NA_bottom pct_rank 0 cat 4.0 2.5 3.0 2.5 0.625 @@ -10223,10 +10424,16 @@ def align( Examples -------- >>> df = pd.DataFrame( - ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2] + ... [[1, 2, 3, 4], [6, 7, 8, 9]], + ... columns=["D", "B", "E", "A"], + ... index=[1, 2], ... ) >>> other = pd.DataFrame( - ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]], + ... [ + ... [10, 20, 30, 40], + ... [60, 70, 80, 90], + ... [600, 700, 800, 900], + ... ], ... columns=["A", "B", "C", "D"], ... index=[2, 3, 4], ... ) @@ -10869,7 +11076,9 @@ def where( 4 10 dtype: int64 - >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) + >>> df = pd.DataFrame( + ... np.arange(10).reshape(-1, 2), columns=["A", "B"] + ... ) >>> df A B 0 0 1 @@ -11085,10 +11294,16 @@ def shift( Examples -------- - >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45], - ... "Col2": [13, 23, 18, 33, 48], - ... "Col3": [17, 27, 22, 37, 52]}}, - ... index=pd.date_range("2020-01-01", "2020-01-05")) + >>> df = pd.DataFrame( + ... { + ... { + ... "Col1": [10, 20, 15, 30, 45], + ... "Col2": [13, 23, 18, 33, 48], + ... "Col3": [17, 27, 22, 37, 52], + ... } + ... }, + ... index=pd.date_range("2020-01-01", "2020-01-05"), + ... ) >>> df Col1 Col2 Col3 2020-01-01 10 13 17 @@ -11137,7 +11352,7 @@ def shift( 2020-01-07 30 33 37 2020-01-08 45 48 52 - >>> df['Col1'].shift(periods=[0, 1, 2]) + >>> df["Col1"].shift(periods=[0, 1, 2]) Col1_0 Col1_1 Col1_2 2020-01-01 10 NaN NaN 2020-01-02 20 10.0 NaN @@ -11270,10 +11485,14 @@ def truncate( Examples -------- - >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'], - ... 'B': ['f', 'g', 'h', 'i', 'j'], - ... 'C': ['k', 'l', 'm', 'n', 'o']}, - ... index=[1, 2, 3, 4, 5]) + >>> df = pd.DataFrame( + ... { + ... "A": ["a", "b", "c", "d", "e"], + ... "B": ["f", "g", "h", "i", "j"], + ... "C": ["k", "l", "m", "n", "o"], + ... }, + ... index=[1, 2, 3, 4, 5], + ... ) >>> df A B C 1 a f k @@ -11300,7 +11519,7 @@ def truncate( For Series, only rows can be truncated. - >>> df['A'].truncate(before=2, after=4) + >>> df["A"].truncate(before=2, after=4) 2 b 3 c 4 d @@ -11309,8 +11528,8 @@ def truncate( The index values in ``truncate`` can be datetimes or string dates. 
- >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s') - >>> df = pd.DataFrame(index=dates, data={'A': 1}) + >>> dates = pd.date_range("2016-01-01", "2016-02-01", freq="s") + >>> df = pd.DataFrame(index=dates, data={"A": 1}) >>> df.tail() A 2016-01-31 23:59:56 1 @@ -11319,8 +11538,10 @@ def truncate( 2016-01-31 23:59:59 1 2016-02-01 00:00:00 1 - >>> df.truncate(before=pd.Timestamp('2016-01-05'), - ... after=pd.Timestamp('2016-01-10')).tail() + >>> df.truncate( + ... before=pd.Timestamp("2016-01-05"), + ... after=pd.Timestamp("2016-01-10"), + ... ).tail() A 2016-01-09 23:59:56 1 2016-01-09 23:59:57 1 @@ -11332,7 +11553,7 @@ def truncate( specify `before` and `after` as strings. They will be coerced to Timestamps before truncation. - >>> df.truncate('2016-01-05', '2016-01-10').tail() + >>> df.truncate("2016-01-05", "2016-01-10").tail() A 2016-01-09 23:59:56 1 2016-01-09 23:59:57 1 @@ -11344,7 +11565,7 @@ def truncate( component (midnight). This differs from partial string slicing, which returns any partially matching dates. - >>> df.loc['2016-01-05':'2016-01-10', :].tail() + >>> df.loc["2016-01-05":"2016-01-10", :].tail() A 2016-01-10 23:59:55 1 2016-01-10 23:59:56 1 @@ -11436,16 +11657,18 @@ def tz_convert( >>> s = pd.Series( ... [1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']), + ... index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"]), ... ) - >>> s.tz_convert('Asia/Shanghai') + >>> s.tz_convert("Asia/Shanghai") 2018-09-15 07:30:00+08:00 1 dtype: int64 Pass None to convert to UTC and get a tz-naive index: - >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + >>> s = pd.Series( + ... [1], + ... index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"]), + ... ) >>> s.tz_convert(None) 2018-09-14 23:30:00 1 dtype: int64 @@ -11566,16 +11789,18 @@ def tz_localize( >>> s = pd.Series( ... [1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']), + ... index=pd.DatetimeIndex(["2018-09-15 01:30:00"]), ... ) - >>> s.tz_localize('CET') + >>> s.tz_localize("CET") 2018-09-15 01:30:00+02:00 1 dtype: int64 Pass None to convert to tz-naive index and preserve local time: - >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + >>> s = pd.Series( + ... [1], + ... index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"]), + ... ) >>> s.tz_localize(None) 2018-09-15 01:30:00 1 dtype: int64 @@ -11583,15 +11808,21 @@ def tz_localize( Be careful with DST changes. When there is sequential data, pandas can infer the DST time: - >>> s = pd.Series(range(7), - ... index=pd.DatetimeIndex(['2018-10-28 01:30:00', - ... '2018-10-28 02:00:00', - ... '2018-10-28 02:30:00', - ... '2018-10-28 02:00:00', - ... '2018-10-28 02:30:00', - ... '2018-10-28 03:00:00', - ... '2018-10-28 03:30:00'])) - >>> s.tz_localize('CET', ambiguous='infer') + >>> s = pd.Series( + ... range(7), + ... index=pd.DatetimeIndex( + ... [ + ... "2018-10-28 01:30:00", + ... "2018-10-28 02:00:00", + ... "2018-10-28 02:30:00", + ... "2018-10-28 02:00:00", + ... "2018-10-28 02:30:00", + ... "2018-10-28 03:00:00", + ... "2018-10-28 03:30:00", + ... ] + ... ), + ... ) + >>> s.tz_localize("CET", ambiguous="infer") 2018-10-28 01:30:00+02:00 0 2018-10-28 02:00:00+02:00 1 2018-10-28 02:30:00+02:00 2 @@ -11604,11 +11835,19 @@ def tz_localize( In some cases, inferring the DST is impossible. In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly - >>> s = pd.Series(range(3), - ... index=pd.DatetimeIndex(['2018-10-28 01:20:00', - ... 
'2018-10-28 02:36:00', - ... '2018-10-28 03:46:00'])) - >>> s.tz_localize('CET', ambiguous=np.array([True, True, False])) + >>> s = pd.Series( + ... range(3), + ... index=pd.DatetimeIndex( + ... [ + ... "2018-10-28 01:20:00", + ... "2018-10-28 02:36:00", + ... "2018-10-28 03:46:00", + ... ] + ... ), + ... ) + >>> s.tz_localize( + ... "CET", ambiguous=np.array([True, True, False]) + ... ) 2018-10-28 01:20:00+02:00 0 2018-10-28 02:36:00+02:00 1 2018-10-28 03:46:00+01:00 2 @@ -11618,18 +11857,23 @@ def tz_localize( dates forward or backward with a timedelta object or `'shift_forward'` or `'shift_backward'`. - >>> s = pd.Series(range(2), - ... index=pd.DatetimeIndex(['2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) - >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward') + >>> s = pd.Series( + ... range(2), + ... index=pd.DatetimeIndex( + ... ["2015-03-29 02:30:00", "2015-03-29 03:30:00"] + ... ), + ... ) + >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_forward") 2015-03-29 03:00:00+02:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 - >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward') + >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_backward") 2015-03-29 01:59:59.999999999+01:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 - >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) + >>> s.tz_localize( + ... "Europe/Warsaw", nonexistent=pd.Timedelta("1h") + ... ) 2015-03-29 03:30:00+02:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 @@ -11790,7 +12034,7 @@ def describe( Describing a categorical ``Series``. - >>> s = pd.Series(['a', 'a', 'b', 'c']) + >>> s = pd.Series(["a", "a", "b", "c"]) >>> s.describe() count 4 unique 3 @@ -11800,11 +12044,13 @@ def describe( Describing a timestamp ``Series``. - >>> s = pd.Series([ - ... np.datetime64("2000-01-01"), - ... np.datetime64("2010-01-01"), - ... np.datetime64("2010-01-01") - ... ]) + >>> s = pd.Series( + ... [ + ... np.datetime64("2000-01-01"), + ... np.datetime64("2010-01-01"), + ... np.datetime64("2010-01-01"), + ... ] + ... ) >>> s.describe() count 3 mean 2006-09-01 08:00:00 @@ -11818,10 +12064,13 @@ def describe( Describing a ``DataFrame``. By default only numeric fields are returned. - >>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']), - ... 'numeric': [1, 2, 3], - ... 'object': ['a', 'b', 'c'] - ... }) + >>> df = pd.DataFrame( + ... { + ... "categorical": pd.Categorical(["d", "e", "f"]), + ... "numeric": [1, 2, 3], + ... "object": ["a", "b", "c"], + ... } + ... ) >>> df.describe() numeric count 3.0 @@ -11835,7 +12084,7 @@ def describe( Describing all columns of a ``DataFrame`` regardless of data type. - >>> df.describe(include='all') # doctest: +SKIP + >>> df.describe(include="all") # doctest: +SKIP categorical numeric object count 3 3.0 3 unique 3 NaN 3 @@ -11887,7 +12136,7 @@ def describe( Including only categorical columns from a ``DataFrame`` description. - >>> df.describe(include=['category']) + >>> df.describe(include=["category"]) categorical count 3 unique 3 @@ -12028,11 +12277,14 @@ def pct_change( Percentage change in French franc, Deutsche Mark, and Italian lira from 1980-01-01 to 1980-03-01. - >>> df = pd.DataFrame({ - ... 'FR': [4.0405, 4.0963, 4.3149], - ... 'GR': [1.7246, 1.7482, 1.8519], - ... 'IT': [804.74, 810.01, 860.13]}, - ... index=['1980-01-01', '1980-02-01', '1980-03-01']) + >>> df = pd.DataFrame( + ... { + ... "FR": [4.0405, 4.0963, 4.3149], + ... "GR": [1.7246, 1.7482, 1.8519], + ... "IT": [804.74, 810.01, 860.13], + ... }, + ... 
index=["1980-01-01", "1980-02-01", "1980-03-01"], + ... ) >>> df FR GR IT 1980-01-01 4.0405 1.7246 804.74 @@ -12048,17 +12300,20 @@ def pct_change( Percentage of change in GOOG and APPL stock volume. Shows computing the percentage change between columns. - >>> df = pd.DataFrame({ - ... '2016': [1769950, 30586265], - ... '2015': [1500923, 40912316], - ... '2014': [1371819, 41403351]}, - ... index=['GOOG', 'APPL']) + >>> df = pd.DataFrame( + ... { + ... "2016": [1769950, 30586265], + ... "2015": [1500923, 40912316], + ... "2014": [1371819, 41403351], + ... }, + ... index=["GOOG", "APPL"], + ... ) >>> df 2016 2015 2014 GOOG 1769950 1500923 1371819 APPL 30586265 40912316 41403351 - >>> df.pct_change(axis='columns', periods=-1) + >>> df.pct_change(axis="columns", periods=-1) 2016 2015 2014 GOOG 0.179241 0.094112 NaN APPL -0.252395 -0.011860 NaN @@ -12199,7 +12454,12 @@ def _accum_func( if axis == 1: return self.T._accum_func( - name, func, axis=0, skipna=skipna, *args, **kwargs # noqa: B026 + name, + func, + axis=0, + skipna=skipna, + *args, # noqa: B026 + **kwargs, ).T def block_accum_func(blk_values): @@ -12677,14 +12937,16 @@ def __imul__(self, other) -> Self: def __itruediv__(self, other) -> Self: # error: Unsupported left operand type for / ("Type[NDFrame]") return self._inplace_method( - other, type(self).__truediv__ # type: ignore[operator] + other, + type(self).__truediv__, # type: ignore[operator] ) @final def __ifloordiv__(self, other) -> Self: # error: Unsupported left operand type for // ("Type[NDFrame]") return self._inplace_method( - other, type(self).__floordiv__ # type: ignore[operator] + other, + type(self).__floordiv__, # type: ignore[operator] ) @final @@ -12772,7 +13034,9 @@ def first_valid_index(self) -> Hashable | None: For DataFrame: - >>> df = pd.DataFrame({{'A': [None, None, 2], 'B': [None, 3, 4]}}) + >>> df = pd.DataFrame( + ... {{"A": [None, None, 2], "B": [None, 3, 4]}} + ... ) >>> df A B 0 NaN NaN @@ -12783,7 +13047,9 @@ def first_valid_index(self) -> Hashable | None: >>> df.last_valid_index() 2 - >>> df = pd.DataFrame({{'A': [None, None, None], 'B': [None, None, None]}}) + >>> df = pd.DataFrame( + ... {{"A": [None, None, None], "B": [None, None, None]}} + ... ) >>> df A B 0 None None @@ -13452,9 +13718,7 @@ def last_valid_index(self) -> Hashable | None: Series([], dtype: bool) """ -_shared_docs[ - "stat_func_example" -] = """ +_shared_docs["stat_func_example"] = """ Examples -------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 17035a35b5a60..8a79e61ad7dfd 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -137,7 +137,9 @@ class NamedAgg(NamedTuple): Examples -------- - >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) + >>> df = pd.DataFrame( + ... {"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]} + ... ) >>> agg_a = pd.NamedAgg(column="a", aggfunc="min") >>> agg_1 = pd.NamedAgg(column=1, aggfunc=lambda x: np.mean(x)) >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) @@ -597,12 +599,15 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): Examples -------- - >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - ... 'foo', 'bar'], - ... 'B' : [1, 2, 3, 4, 5, 6], - ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) - >>> grouped = df.groupby('A') - >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) + >>> df = pd.DataFrame( + ... { + ... "A": ["foo", "bar", "foo", "bar", "foo", "bar"], + ... "B": [1, 2, 3, 4, 5, 6], + ... 
"C": [2.0, 5.0, 8.0, 1.0, 2.0, 9.0], + ... } + ... ) + >>> grouped = df.groupby("A") + >>> df.groupby("A").B.filter(lambda x: x.mean() > 3.0) 1 2 3 4 5 6 @@ -645,7 +650,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: -------- For SeriesGroupby: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 3], index=lst) >>> ser a 1 @@ -660,15 +665,24 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: For Resampler: - >>> ser = pd.Series([1, 2, 3, 3], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 3], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 3 dtype: int64 - >>> ser.resample('MS').nunique() + >>> ser.resample("MS").nunique() 2023-01-01 2 2023-02-01 1 Freq: MS, dtype: int64 @@ -848,7 +862,10 @@ def value_counts( # "List[ndarray[Any, Any]]"; expected "List[Union[Union[ExtensionArray, # ndarray[Any, Any]], Index, Series]] _, idx = get_join_indexers( - left, right, sort=False, how="left" # type: ignore[arg-type] + left, + right, + sort=False, + how="left", # type: ignore[arg-type] ) if idx is not None: out = np.where(idx != -1, out[idx], 0) @@ -937,7 +954,7 @@ def fillna( -------- For SeriesGroupBy: - >>> lst = ['cat', 'cat', 'cat', 'mouse', 'mouse'] + >>> lst = ["cat", "cat", "cat", "mouse", "mouse"] >>> ser = pd.Series([1, None, None, 2, None], index=lst) >>> ser cat 1.0 @@ -1022,13 +1039,17 @@ def take( Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan), - ... ('rabbit', 'mammal', 15.0)], - ... columns=['name', 'class', 'max_speed'], - ... index=[4, 3, 2, 1, 0]) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0), + ... ("parrot", "bird", 24.0), + ... ("lion", "mammal", 80.5), + ... ("monkey", "mammal", np.nan), + ... ("rabbit", "mammal", 15.0), + ... ], + ... columns=["name", "class", "max_speed"], + ... index=[4, 3, 2, 1, 0], + ... ) >>> df name class max_speed 4 falcon bird 389.0 @@ -1101,10 +1122,19 @@ def skew( Examples -------- - >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.], - ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', - ... 'Parrot', 'Parrot', 'Parrot'], - ... name="Max Speed") + >>> ser = pd.Series( + ... [390.0, 350.0, 357.0, np.nan, 22.0, 20.0, 30.0], + ... index=[ + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... ], + ... name="Max Speed", + ... ) >>> ser Falcon 390.0 Falcon 350.0 @@ -1217,7 +1247,10 @@ def is_monotonic_increasing(self) -> Series: Examples -------- - >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s = pd.Series( + ... [2, 1, 3, 4], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... ) >>> s.groupby(level=0).is_monotonic_increasing Falcon False Parrot True @@ -1236,7 +1269,10 @@ def is_monotonic_decreasing(self) -> Series: Examples -------- - >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s = pd.Series( + ... [2, 1, 3, 4], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... 
) >>> s.groupby(level=0).is_monotonic_decreasing Falcon True Parrot False @@ -1300,13 +1336,17 @@ def unique(self) -> Series: Examples -------- - >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1), - ... ('Beagle', 'dog', 15.2), - ... ('Chihuahua', 'dog', 6.9), - ... ('Persian', 'cat', 9.2), - ... ('Chihuahua', 'dog', 7), - ... ('Persian', 'cat', 8.8)], - ... columns=['breed', 'animal', 'height_in']) + >>> df = pd.DataFrame( + ... [ + ... ("Chihuahua", "dog", 6.1), + ... ("Beagle", "dog", 15.2), + ... ("Chihuahua", "dog", 6.9), + ... ("Persian", "cat", 9.2), + ... ("Chihuahua", "dog", 7), + ... ("Persian", "cat", 8.8), + ... ], + ... columns=["breed", "animal", "height_in"], + ... ) >>> df breed animal height_in 0 Chihuahua dog 6.1 @@ -1315,7 +1355,7 @@ def unique(self) -> Series: 3 Persian cat 9.2 4 Chihuahua dog 7.0 5 Persian cat 8.8 - >>> ser = df.groupby('animal')['breed'].unique() + >>> ser = df.groupby("animal")["breed"].unique() >>> ser animal cat [Persian] @@ -1895,12 +1935,15 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): Examples -------- - >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - ... 'foo', 'bar'], - ... 'B' : [1, 2, 3, 4, 5, 6], - ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) - >>> grouped = df.groupby('A') - >>> grouped.filter(lambda x: x['B'].mean() > 3.) + >>> df = pd.DataFrame( + ... { + ... "A": ["foo", "bar", "foo", "bar", "foo", "bar"], + ... "B": [1, 2, 3, 4, 5, 6], + ... "C": [2.0, 5.0, 8.0, 1.0, 2.0, 9.0], + ... } + ... ) + >>> grouped = df.groupby("A") + >>> grouped.filter(lambda x: x["B"].mean() > 3.0) A B C 1 bar 2 5.0 3 bar 4 1.0 @@ -2058,10 +2101,13 @@ def nunique(self, dropna: bool = True) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', - ... 'ham', 'ham'], - ... 'value1': [1, 5, 5, 2, 5, 5], - ... 'value2': list('abbaxy')}) + >>> df = pd.DataFrame( + ... { + ... "id": ["spam", "egg", "egg", "spam", "ham", "ham"], + ... "value1": [1, 5, 5, 2, 5, 5], + ... "value2": list("abbaxy"), + ... } + ... ) >>> df id value1 value2 0 spam 1 a @@ -2071,7 +2117,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: 4 ham 5 x 5 ham 5 y - >>> df.groupby('id').nunique() + >>> df.groupby("id").nunique() value1 value2 id egg 1 1 @@ -2080,7 +2126,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: Check for rows with the same id but conflicting values: - >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) + >>> df.groupby("id").filter(lambda g: (g.nunique() > 1).any()) id value1 value2 0 spam 1 a 3 spam 2 a @@ -2149,9 +2195,13 @@ def idxmax( -------- Consider a dataset containing food consumption in Argentina. - >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + >>> df = pd.DataFrame( + ... { + ... "consumption": [10.51, 103.11, 55.48], + ... "co2_emissions": [37.2, 19.66, 1712], + ... }, + ... index=["Pork", "Wheat Products", "Beef"], + ... ) >>> df consumption co2_emissions @@ -2231,9 +2281,13 @@ def idxmin( -------- Consider a dataset containing food consumption in Argentina. - >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + >>> df = pd.DataFrame( + ... { + ... "consumption": [10.51, 103.11, 55.48], + ... "co2_emissions": [37.2, 19.66, 1712], + ... }, + ... index=["Pork", "Wheat Products", "Beef"], + ... 
) >>> df consumption co2_emissions @@ -2315,11 +2369,27 @@ def value_counts( Examples -------- - >>> df = pd.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], - ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], - ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] - ... }) + >>> df = pd.DataFrame( + ... { + ... "gender": [ + ... "male", + ... "male", + ... "female", + ... "male", + ... "female", + ... "male", + ... ], + ... "education": [ + ... "low", + ... "medium", + ... "high", + ... "low", + ... "high", + ... "low", + ... ], + ... "country": ["US", "FR", "US", "FR", "FR", "FR"], + ... } + ... ) >>> df gender education country @@ -2330,7 +2400,7 @@ def value_counts( 4 female high FR 5 male low FR - >>> df.groupby('gender').value_counts() + >>> df.groupby("gender").value_counts() gender education country female high FR 1 US 1 @@ -2339,7 +2409,7 @@ def value_counts( medium FR 1 Name: count, dtype: int64 - >>> df.groupby('gender').value_counts(ascending=True) + >>> df.groupby("gender").value_counts(ascending=True) gender education country female high FR 1 US 1 @@ -2348,7 +2418,7 @@ def value_counts( low FR 2 Name: count, dtype: int64 - >>> df.groupby('gender').value_counts(normalize=True) + >>> df.groupby("gender").value_counts(normalize=True) gender education country female high FR 0.50 US 0.50 @@ -2357,7 +2427,7 @@ def value_counts( medium FR 0.25 Name: proportion, dtype: float64 - >>> df.groupby('gender', as_index=False).value_counts() + >>> df.groupby("gender", as_index=False).value_counts() gender education country count 0 female high FR 1 1 female high US 1 @@ -2365,7 +2435,9 @@ def value_counts( 3 male low US 1 4 male medium FR 1 - >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + >>> df.groupby("gender", as_index=False).value_counts( + ... normalize=True + ... ) gender education country proportion 0 female high FR 0.50 1 female high US 0.50 @@ -2475,7 +2547,9 @@ def fillna( Propagate non-null values forward or backward within each group along rows. - >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T + >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna( + ... method="ffill" + ... ).T key A B C 0 0.0 0.0 2.0 2.0 1 0.0 2.0 3.0 3.0 @@ -2483,7 +2557,9 @@ def fillna( 3 1.0 3.0 NaN NaN 4 1.0 1.0 NaN NaN - >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T + >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna( + ... method="bfill" + ... ).T key A B C 0 0.0 NaN 2.0 NaN 1 0.0 2.0 3.0 NaN @@ -2568,13 +2644,17 @@ def take( Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan), - ... ('rabbit', 'mammal', 15.0)], - ... columns=['name', 'class', 'max_speed'], - ... index=[4, 3, 2, 1, 0]) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0), + ... ("parrot", "bird", 24.0), + ... ("lion", "mammal", 80.5), + ... ("monkey", "mammal", np.nan), + ... ("rabbit", "mammal", 15.0), + ... ], + ... columns=["name", "class", "max_speed"], + ... index=[4, 3, 2, 1, 0], + ... ) >>> df name class max_speed 4 falcon bird 389.0 @@ -2666,14 +2746,43 @@ def skew( Examples -------- - >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi', - ... 'lion', 'monkey', 'rabbit'], - ... ['bird', 'bird', 'bird', 'bird', - ... 'mammal', 'mammal', 'mammal']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class')) - >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan, - ... 
80.5, 21.5, 15.0]}, - ... index=index) + >>> arrays = [ + ... [ + ... "falcon", + ... "parrot", + ... "cockatoo", + ... "kiwi", + ... "lion", + ... "monkey", + ... "rabbit", + ... ], + ... [ + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "mammal", + ... "mammal", + ... "mammal", + ... ], + ... ] + >>> index = pd.MultiIndex.from_arrays( + ... arrays, names=("name", "class") + ... ) + >>> df = pd.DataFrame( + ... { + ... "max_speed": [ + ... 389.0, + ... 24.0, + ... 70.0, + ... np.nan, + ... 80.5, + ... 21.5, + ... 15.0, + ... ] + ... }, + ... index=index, + ... ) >>> df max_speed name class diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b46acef08e9ea..0f1a8a29f7881 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -802,7 +802,7 @@ def groups(self) -> dict[Hashable, np.ndarray]: For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -826,15 +826,24 @@ def groups(self) -> dict[Hashable, np.ndarray]: For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').groups + >>> ser.resample("MS").groups {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} """ return self.grouper.groups @@ -855,7 +864,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -868,8 +877,11 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: For DataFrameGroupBy: >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"], + ... ) >>> df a b c owl 1 2 3 @@ -880,15 +892,24 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').indices + >>> ser.resample("MS").indices defaultdict(<class 'list'>, {Timestamp('2023-01-01 00:00:00'): [0, 1], Timestamp('2023-02-01 00:00:00'): [2, 3]}) """ @@ -1036,7 +1057,7 @@ def get_group(self, name, obj=None) -> DataFrame | Series: For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -1051,8 +1072,11 @@ def get_group(self, name, obj=None) -> DataFrame | Series: For DataFrameGroupBy: >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"], + ...
) >>> df a b c owl 1 2 3 @@ -1065,15 +1089,24 @@ def get_group(self, name, obj=None) -> DataFrame | Series: For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').get_group('2023-01-01') + >>> ser.resample("MS").get_group("2023-01-01") 2023-01-01 1 2023-01-15 2 dtype: int64 @@ -1130,7 +1163,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -1138,7 +1171,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: b 3 dtype: int64 >>> for x, y in ser.groupby(level=0): - ... print(f'{x}\\n{y}\\n') + ... print(f"{x}\\n{y}\\n") a a 1 a 2 @@ -1157,7 +1190,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: 1 1 5 6 2 7 8 9 >>> for x, y in df.groupby(by=["a"]): - ... print(f'{x}\\n{y}\\n') + ... print(f"{x}\\n{y}\\n") (1,) a b c 0 1 2 3 @@ -1168,16 +1201,25 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> for x, y in ser.resample('MS'): - ... print(f'{x}\\n{y}\\n') + >>> for x, y in ser.resample("MS"): + ... print(f"{x}\\n{y}\\n") 2023-01-01 00:00:00 2023-01-01 1 2023-01-15 2 @@ -2159,7 +2201,7 @@ def any(self, skipna: bool = True) -> NDFrameT: -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 0], index=lst) >>> ser a 1 @@ -2174,8 +2216,11 @@ def any(self, skipna: bool = True) -> NDFrameT: For DataFrameGroupBy: >>> data = [[1, 0, 3], [1, 0, 6], [7, 1, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["ostrich", "penguin", "parrot"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["ostrich", "penguin", "parrot"], + ... ) >>> df a b c ostrich 1 0 3 @@ -2216,7 +2261,7 @@ def all(self, skipna: bool = True) -> NDFrameT: For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 0], index=lst) >>> ser a 1 @@ -2231,8 +2276,11 @@ def all(self, skipna: bool = True) -> NDFrameT: For DataFrameGroupBy: >>> data = [[1, 0, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["ostrich", "penguin", "parrot"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["ostrich", "penguin", "parrot"], + ... ) >>> df a b c ostrich 1 0 3 @@ -2266,7 +2314,7 @@ def count(self) -> NDFrameT: -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, np.nan], index=lst) >>> ser a 1.0 @@ -2281,8 +2329,11 @@ def count(self) -> NDFrameT: For DataFrameGroupBy: >>> data = [[1, np.nan, 3], [1, np.nan, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... 
index=["cow", "horse", "bull"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["cow", "horse", "bull"], + ... ) >>> df a b c cow 1 NaN 3 @@ -2296,15 +2347,24 @@ def count(self) -> NDFrameT: For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').count() + >>> ser.resample("MS").count() 2023-01-01 2 2023-02-01 2 Freq: MS, dtype: int64 @@ -2395,14 +2455,19 @@ def mean( %(see_also)s Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], - ... 'B': [np.nan, 2, 3, 4, 5], - ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) + >>> df = pd.DataFrame( + ... { + ... "A": [1, 1, 2, 1, 2], + ... "B": [np.nan, 2, 3, 4, 5], + ... "C": [1, 2, 1, 1, 2], + ... }, + ... columns=["A", "B", "C"], + ... ) Groupby one column and return the mean of the remaining columns in each group. - >>> df.groupby('A').mean() + >>> df.groupby("A").mean() B C A 1 3.0 1.333333 @@ -2410,7 +2475,7 @@ def mean( Groupby two columns and return the mean of the remaining column. - >>> df.groupby(['A', 'B']).mean() + >>> df.groupby(["A", "B"]).mean() C A B 1 2.0 2.0 @@ -2421,7 +2486,7 @@ def mean( Groupby one column and return the mean of only particular column in the group. - >>> df.groupby('A')['B'].mean() + >>> df.groupby("A")["B"].mean() A 1 3.0 2 4.0 @@ -2470,7 +2535,7 @@ def median(self, numeric_only: bool = False) -> NDFrameT: -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser a 7 @@ -2487,9 +2552,22 @@ def median(self, numeric_only: bool = False) -> NDFrameT: For DataFrameGroupBy: - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> data = { + ... "a": [1, 3, 5, 7, 7, 8, 3], + ... "b": [1, 4, 8, 4, 4, 2, 1], + ... } + >>> df = pd.DataFrame( + ... data, + ... index=[ + ... "dog", + ... "dog", + ... "dog", + ... "mouse", + ... "mouse", + ... "mouse", + ... "mouse", + ... ], + ... ) >>> df a b dog 1 1 @@ -2506,14 +2584,20 @@ def median(self, numeric_only: bool = False) -> NDFrameT: For Resampler: - >>> ser = pd.Series([1, 2, 3, 3, 4, 5], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').median() + >>> ser = pd.Series( + ... [1, 2, 3, 3, 4, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").median() 2023-01-01 2.0 2023-02-01 4.0 Freq: MS, dtype: float64 @@ -2580,7 +2664,7 @@ def std( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser a 7 @@ -2597,9 +2681,22 @@ def std( For DataFrameGroupBy: - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> data = { + ... 
"a": [1, 3, 5, 7, 7, 8, 3], + ... "b": [1, 4, 8, 4, 4, 2, 1], + ... } + >>> df = pd.DataFrame( + ... data, + ... index=[ + ... "dog", + ... "dog", + ... "dog", + ... "mouse", + ... "mouse", + ... "mouse", + ... "mouse", + ... ], + ... ) >>> df a b dog 1 1 @@ -2689,7 +2786,7 @@ def var( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser a 7 @@ -2706,9 +2803,22 @@ def var( For DataFrameGroupBy: - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> data = { + ... "a": [1, 3, 5, 7, 7, 8, 3], + ... "b": [1, 4, 8, 4, 4, 2, 1], + ... } + >>> df = pd.DataFrame( + ... data, + ... index=[ + ... "dog", + ... "dog", + ... "dog", + ... "mouse", + ... "mouse", + ... "mouse", + ... "mouse", + ... ], + ... ) >>> df a b dog 1 1 @@ -2915,7 +3025,7 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([5, 10, 8, 14], index=lst) >>> ser a 5 @@ -2931,8 +3041,11 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: For DataFrameGroupBy: >>> data = [[1, 12, 11], [1, 15, 2], [2, 5, 8], [2, 6, 12]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"], + ... ) >>> df a b c tuna 1 12 11 @@ -2947,14 +3060,20 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: For Resampler: - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').sem() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").sem() 2023-01-01 0.577350 2023-02-01 1.527525 Freq: MS, dtype: float64 @@ -2989,7 +3108,7 @@ def size(self) -> DataFrame | Series: For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -3002,8 +3121,11 @@ def size(self) -> DataFrame | Series: dtype: int64 >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"], + ... ) >>> df a b c owl 1 2 3 @@ -3017,14 +3139,18 @@ def size(self) -> DataFrame | Series: For Resampler: - >>> ser = pd.Series([1, 2, 3], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01'])) + >>> ser = pd.Series( + ... [1, 2, 3], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01"] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 dtype: int64 - >>> ser.resample('MS').size() + >>> ser.resample("MS").size() 2023-01-01 2 2023-02-01 1 Freq: MS, dtype: int64 @@ -3351,9 +3477,15 @@ def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: Examples -------- - >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3], - ... 
D=['3/11/2000', '3/12/2000', '3/13/2000'])) - >>> df['D'] = pd.to_datetime(df['D']) + >>> df = pd.DataFrame( + ... dict( + ... A=[1, 1, 3], + ... B=[None, 5, 6], + ... C=[1, 2, 3], + ... D=["3/11/2000", "3/12/2000", "3/13/2000"], + ... ) + ... ) + >>> df["D"] = pd.to_datetime(df["D"]) >>> df.groupby("A").first() B C D A @@ -3422,7 +3554,9 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: Examples -------- - >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) + >>> df = pd.DataFrame( + ... dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]) + ... ) >>> df.groupby("A").last() B C A @@ -3469,8 +3603,19 @@ def ohlc(self) -> DataFrame: For SeriesGroupBy: - >>> lst = ['SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC',] - >>> ser = pd.Series([3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst) + >>> lst = [ + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... ] + >>> ser = pd.Series( + ... [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst + ... ) >>> ser SPX 3.4 CAC 9.0 @@ -3488,10 +3633,23 @@ def ohlc(self) -> DataFrame: For DataFrameGroupBy: - >>> data = {2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2 , 1], - ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0]} - >>> df = pd.DataFrame(data, index=['SPX', 'CAC', 'SPX', 'CAC', - ... 'SPX', 'CAC', 'SPX', 'CAC']) + >>> data = { + ... 2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2, 1], + ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0], + ... } + >>> df = pd.DataFrame( + ... data, + ... index=[ + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... ], + ... ) >>> df 2022 2023 SPX 1.2 3.4 @@ -3510,14 +3668,20 @@ def ohlc(self) -> DataFrame: For Resampler: - >>> ser = pd.Series([1, 3, 2, 4, 3, 5], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').ohlc() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").ohlc() open high low close 2023-01-01 1 3 1 2 2023-02-01 4 5 3 5 @@ -3632,10 +3796,10 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Examples -------- - >>> idx = pd.date_range('1/1/2000', periods=4, freq='min') - >>> df = pd.DataFrame(data=4 * [range(2)], - ... index=idx, - ... columns=['a', 'b']) + >>> idx = pd.date_range("1/1/2000", periods=4, freq="min") + >>> df = pd.DataFrame( + ... data=4 * [range(2)], index=idx, columns=["a", "b"] + ... ) >>> df.iloc[2, 0] = 5 >>> df a b @@ -3647,7 +3811,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> df.groupby('a').resample('3min', include_groups=False).sum() + >>> df.groupby("a").resample("3min", include_groups=False).sum() b a 0 2000-01-01 00:00:00 2 @@ -3656,7 +3820,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Upsample the series into 30 second bins. - >>> df.groupby('a').resample('30s', include_groups=False).sum() + >>> df.groupby("a").resample("30s", include_groups=False).sum() b a 0 2000-01-01 00:00:00 1 @@ -3670,7 +3834,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Resample by month. 
Values are assigned to the month of the period. - >>> df.groupby('a').resample('ME', include_groups=False).sum() + >>> df.groupby("a").resample("ME", include_groups=False).sum() b a 0 2000-01-31 3 @@ -3680,8 +3844,8 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp side of the bin interval. >>> ( - ... df.groupby('a') - ... .resample('3min', closed='right', include_groups=False) + ... df.groupby("a") + ... .resample("3min", closed="right", include_groups=False) ... .sum() ... ) b @@ -3695,8 +3859,13 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp the left. >>> ( - ... df.groupby('a') - ... .resample('3min', closed='right', label='right', include_groups=False) + ... df.groupby("a") + ... .resample( + ... "3min", + ... closed="right", + ... label="right", + ... include_groups=False, + ... ) ... .sum() ... ) b @@ -3809,9 +3978,13 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 2, 2], - ... 'B': [1, 2, 3, 4], - ... 'C': [0.362, 0.227, 1.267, -0.562]}) + >>> df = pd.DataFrame( + ... { + ... "A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362, 0.227, 1.267, -0.562], + ... } + ... ) >>> df A B C 0 1 1 0.362 @@ -3819,7 +3992,7 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: 2 2 3 1.267 3 2 4 -0.562 - >>> df.groupby('A').rolling(2).sum() + >>> df.groupby("A").rolling(2).sum() B C A 1 0 NaN NaN @@ -3827,7 +4000,7 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: 2 2 NaN NaN 3 7.0 0.705 - >>> df.groupby('A').rolling(2, min_periods=1).sum() + >>> df.groupby("A").rolling(2, min_periods=1).sum() B C A 1 0 1.0 0.362 @@ -3835,7 +4008,7 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: 2 2 3.0 1.267 3 7.0 0.705 - >>> df.groupby('A').rolling(2, on='B').sum() + >>> df.groupby("A").rolling(2, on="B").sum() B C A 1 0 1 NaN @@ -4099,7 +4272,7 @@ def bfill(self, limit: int | None = None): With Series: - >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'] + >>> index = ["Falcon", "Falcon", "Parrot", "Parrot", "Parrot"] >>> s = pd.Series([None, 1, None, None, 3], index=index) >>> s Falcon NaN @@ -4125,8 +4298,13 @@ def bfill(self, limit: int | None = None): With DataFrame: - >>> df = pd.DataFrame({'A': [1, None, None, None, 4], - ... 'B': [None, None, 5, None, 7]}, index=index) + >>> df = pd.DataFrame( + ... { + ... "A": [1, None, None, None, 4], + ... "B": [None, None, 5, None, 7], + ... }, + ... index=index, + ... ) >>> df A B Falcon 1.0 NaN @@ -4187,9 +4365,11 @@ def nth(self) -> GroupByNthSelector: Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], - ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) - >>> g = df.groupby('A') + >>> df = pd.DataFrame( + ... {"A": [1, 1, 2, 1, 2], "B": [np.nan, 2, 3, 4, 5]}, + ... columns=["A", "B"], + ... ) + >>> g = df.groupby("A") >>> g.nth(0) A B 0 1 NaN @@ -4230,7 +4410,7 @@ def nth(self) -> GroupByNthSelector: Specifying `dropna` allows ignoring ``NaN`` values - >>> g.nth(0, dropna='any') + >>> g.nth(0, dropna="any") A B 1 1 2.0 2 2 3.0 @@ -4238,7 +4418,7 @@ def nth(self) -> GroupByNthSelector: When the specified ``n`` is larger than any of the groups, an empty DataFrame is returned - >>> g.nth(3, dropna='any') + >>> g.nth(3, dropna="any") Empty DataFrame Columns: [A, B] Index: [] @@ -4341,11 +4521,18 @@ def quantile( Examples -------- - >>> df = pd.DataFrame([ - ... ['a', 1], ['a', 2], ['a', 3], - ... ['b', 1], ['b', 3], ['b', 5] - ... 
], columns=['key', 'val']) - >>> df.groupby('key').quantile() + >>> df = pd.DataFrame( + ... [ + ... ["a", 1], + ... ["a", 2], + ... ["a", 3], + ... ["b", 1], + ... ["b", 3], + ... ["b", 5], + ... ], + ... columns=["key", "val"], + ... ) + >>> df.groupby("key").quantile() val key a 2.0 @@ -4561,7 +4748,9 @@ def ngroup(self, ascending: bool = True): Examples -------- - >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) + >>> df = pd.DataFrame( + ... {"color": ["red", None, "red", "blue", "blue", "red"]} + ... ) >>> df color 0 red @@ -4643,8 +4832,10 @@ def cumcount(self, ascending: bool = True): Examples -------- - >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], - ... columns=['A']) + >>> df = pd.DataFrame( + ... [["a"], ["a"], ["a"], ["b"], ["b"], ["a"]], + ... columns=["A"], + ... ) >>> df A 0 a @@ -4653,7 +4844,7 @@ def cumcount(self, ascending: bool = True): 3 b 4 b 5 a - >>> df.groupby('A').cumcount() + >>> df.groupby("A").cumcount() 0 0 1 1 2 2 @@ -4661,7 +4852,7 @@ def cumcount(self, ascending: bool = True): 4 1 5 3 dtype: int64 - >>> df.groupby('A').cumcount(ascending=False) + >>> df.groupby("A").cumcount(ascending=False) 0 3 1 2 2 1 @@ -4719,7 +4910,18 @@ def rank( -------- >>> df = pd.DataFrame( ... { - ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "group": [ + ... "a", + ... "a", + ... "a", + ... "a", + ... "a", + ... "b", + ... "b", + ... "b", + ... "b", + ... "b", + ... ], ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], ... } ... ) @@ -4735,8 +4937,10 @@ def rank( 7 b 4 8 b 1 9 b 5 - >>> for method in ['average', 'min', 'max', 'dense', 'first']: - ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) + >>> for method in ["average", "min", "max", "dense", "first"]: + ... df[f"{method}_rank"] = df.groupby("group")[ + ... "value" + ... ].rank(method) >>> df group value average_rank min_rank max_rank dense_rank first_rank 0 a 2 1.5 1.0 2.0 1.0 1.0 @@ -4799,7 +5003,7 @@ def cumprod( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([6, 2, 0], index=lst) >>> ser a 6 @@ -4815,8 +5019,11 @@ def cumprod( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["cow", "horse", "bull"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["cow", "horse", "bull"], + ... ) >>> df a b c cow 1 8 2 @@ -4860,7 +5067,7 @@ def cumsum( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([6, 2, 0], index=lst) >>> ser a 6 @@ -4876,8 +5083,11 @@ def cumsum( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["fox", "gorilla", "lion"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["fox", "gorilla", "lion"], + ... ) >>> df a b c fox 1 8 2 @@ -4924,7 +5134,7 @@ def cummin( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=lst) >>> ser a 1 @@ -4946,8 +5156,11 @@ def cummin( For DataFrameGroupBy: >>> data = [[1, 0, 2], [1, 1, 5], [6, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["snake", "rabbit", "turtle"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["snake", "rabbit", "turtle"], + ... 
) >>> df a b c snake 1 0 2 @@ -4999,7 +5212,7 @@ def cummax( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=lst) >>> ser a 1 @@ -5021,8 +5234,11 @@ def cummax( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 1, 0], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["cow", "horse", "bull"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["cow", "horse", "bull"], + ... ) >>> df a b c cow 1 8 2 @@ -5107,7 +5323,7 @@ def shift( For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 4], index=lst) >>> ser a 1 @@ -5125,8 +5341,11 @@ def shift( For DataFrameGroupBy: >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"], + ... ) >>> df a b c tuna 1 2 3 @@ -5176,7 +5395,10 @@ def shift( period = cast(int, period) if freq is not None or axis != 0: f = lambda x: x.shift( - period, freq, axis, fill_value # pylint: disable=cell-var-from-loop + period, + freq, + axis, + fill_value, # pylint: disable=cell-var-from-loop ) shifted = self._python_apply_general( f, self._selected_obj, is_transform=True @@ -5243,7 +5465,7 @@ def diff( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser a 7 @@ -5264,9 +5486,22 @@ def diff( For DataFrameGroupBy: - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> data = { + ... "a": [1, 3, 5, 7, 7, 8, 3], + ... "b": [1, 4, 8, 4, 4, 2, 1], + ... } + >>> df = pd.DataFrame( + ... data, + ... index=[ + ... "dog", + ... "dog", + ... "dog", + ... "mouse", + ... "mouse", + ... "mouse", + ... "mouse", + ... ], + ... ) >>> df a b dog 1 1 @@ -5335,7 +5570,7 @@ def pct_change( For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 4], index=lst) >>> ser a 1 @@ -5353,8 +5588,11 @@ def pct_change( For DataFrameGroupBy: >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"], + ... ) >>> df a b c tuna 1 2 3 @@ -5452,13 +5690,14 @@ def head(self, n: int = 5) -> NDFrameT: Examples -------- - >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], - ... columns=['A', 'B']) - >>> df.groupby('A').head(1) + >>> df = pd.DataFrame( + ... [[1, 2], [1, 4], [5, 6]], columns=["A", "B"] + ... ) + >>> df.groupby("A").head(1) A B 0 1 2 2 5 6 - >>> df.groupby('A').head(-1) + >>> df.groupby("A").head(-1) A B 0 1 2 """ @@ -5490,13 +5729,15 @@ def tail(self, n: int = 5) -> NDFrameT: Examples -------- - >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], - ... columns=['A', 'B']) - >>> df.groupby('A').tail(1) + >>> df = pd.DataFrame( + ... [["a", 1], ["a", 2], ["b", 1], ["b", 2]], + ... columns=["A", "B"], + ... 
) + >>> df.groupby("A").tail(1) A B 1 a 2 3 b 2 - >>> df.groupby('A').tail(-1) + >>> df.groupby("A").tail(-1) A B 1 a 2 3 b 2 @@ -5683,7 +5924,10 @@ def sample( Examples -------- >>> df = pd.DataFrame( - ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} + ... { + ... "a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, + ... "b": range(6), + ... } ... ) >>> df a b diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e2224caad9e84..ce1158e276677 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -131,7 +131,13 @@ class Grouper: >>> df = pd.DataFrame( ... { - ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"], + ... "Animal": [ + ... "Falcon", + ... "Parrot", + ... "Falcon", + ... "Falcon", + ... "Parrot", + ... ], ... "Speed": [100, 5, 200, 300, 15], ... } ... ) @@ -151,15 +157,15 @@ class Grouper: Specify a resample operation on the column 'Publish date' >>> df = pd.DataFrame( - ... { - ... "Publish date": [ + ... { + ... "Publish date": [ ... pd.Timestamp("2000-01-02"), ... pd.Timestamp("2000-01-02"), ... pd.Timestamp("2000-01-09"), - ... pd.Timestamp("2000-01-16") + ... pd.Timestamp("2000-01-16"), ... ], ... "ID": [0, 1, 2, 3], - ... "Price": [10, 20, 30, 40] + ... "Price": [10, 20, 30, 40], ... } ... ) >>> df @@ -177,8 +183,8 @@ class Grouper: If you want to adjust the start of the bins based on a fixed timestamp: - >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' - >>> rng = pd.date_range(start, end, freq='7min') + >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + >>> rng = pd.date_range(start, end, freq="7min") >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) >>> ts 2000-10-01 23:30:00 0 @@ -192,7 +198,7 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 7min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17min')).sum() + >>> ts.groupby(pd.Grouper(freq="17min")).sum() 2000-10-01 23:14:00 0 2000-10-01 23:31:00 9 2000-10-01 23:48:00 21 @@ -200,7 +206,7 @@ class Grouper: 2000-10-02 00:22:00 24 Freq: 17min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() + >>> ts.groupby(pd.Grouper(freq="17min", origin="epoch")).sum() 2000-10-01 23:18:00 0 2000-10-01 23:35:00 18 2000-10-01 23:52:00 27 @@ -208,7 +214,9 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 17min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + >>> ts.groupby( + ... pd.Grouper(freq="17min", origin="2000-01-01") + ... ).sum() 2000-10-01 23:24:00 3 2000-10-01 23:41:00 15 2000-10-01 23:58:00 45 @@ -218,14 +226,16 @@ class Grouper: If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: - >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() + >>> ts.groupby(pd.Grouper(freq="17min", origin="start")).sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 Freq: 17min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() + >>> ts.groupby( + ... pd.Grouper(freq="17min", offset="23h30min") + ... 
).sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 @@ -235,7 +245,7 @@ class Grouper: To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`: - >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum() + >>> ts.groupby(pd.Grouper(freq="17min", offset="2min")).sum() 2000-10-01 23:16:00 0 2000-10-01 23:33:00 9 2000-10-01 23:50:00 36 @@ -1008,7 +1018,8 @@ def is_in_obj(gpr) -> bool: return False if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): return gpr._mgr.references_same_values( # type: ignore[union-attr] - obj_gpr_column._mgr, 0 # type: ignore[arg-type] + obj_gpr_column._mgr, + 0, # type: ignore[arg-type] ) return False try: diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index a3c5ab8edc94e..87e3703d57990 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -99,8 +99,10 @@ def _positional_selector(self) -> GroupByPositionalSelector: Examples -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) + >>> df = pd.DataFrame( + ... [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"], + ... ) >>> df.groupby("A")._positional_selector[1:2] A B 1 a 2 diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index f2db4886a5590..df66bcae91b3f 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -51,7 +51,9 @@ class BaseIndexer: -------- >>> from pandas.api.indexers import BaseIndexer >>> class CustomIndexer(BaseIndexer): - ... def get_window_bounds(self, num_values, min_periods, center, closed, step): + ... def get_window_bounds( + ... self, num_values, min_periods, center, closed, step + ... ): ... start = np.empty(num_values, dtype=np.int64) ... end = np.empty(num_values, dtype=np.int64) ... for i in range(num_values): @@ -153,9 +155,13 @@ class VariableOffsetWindowIndexer(BaseIndexer): Examples -------- >>> from pandas.api.indexers import VariableOffsetWindowIndexer - >>> df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10)) + >>> df = pd.DataFrame( + ... range(10), index=pd.date_range("2020", periods=10) + ... ) >>> offset = pd.offsets.BDay(1) - >>> indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset) + >>> indexer = VariableOffsetWindowIndexer( + ... index=df.index, offset=offset + ... ) >>> df 0 2020-01-01 0 @@ -300,7 +306,7 @@ class FixedForwardWindowIndexer(BaseIndexer): Examples -------- - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) >>> df B 0 0.0 @@ -309,7 +315,9 @@ class FixedForwardWindowIndexer(BaseIndexer): 3 NaN 4 4.0 - >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2) + >>> indexer = pd.api.indexers.FixedForwardWindowIndexer( + ... window_size=2 + ... ) >>> df.rolling(window=indexer, min_periods=1).sum() B 0 1.0 diff --git a/pandas/core/indexers/utils.py b/pandas/core/indexers/utils.py index 55bb58f3108c3..78dbe3a1ca632 100644 --- a/pandas/core/indexers/utils.py +++ b/pandas/core/indexers/utils.py @@ -202,7 +202,7 @@ def validate_indices(indices: np.ndarray, n: int) -> None: Examples -------- - >>> validate_indices(np.array([1, 2]), 3) # OK + >>> validate_indices(np.array([1, 2]), 3) # OK >>> validate_indices(np.array([1, -2]), 3) Traceback (most recent call last): @@ -214,7 +214,7 @@ def validate_indices(indices: np.ndarray, n: int) -> None: ... 
IndexError: indices are out-of-bounds - >>> validate_indices(np.array([-1, -1]), 0) # OK + >>> validate_indices(np.array([-1, -1]), 0) # OK >>> validate_indices(np.array([0, 1]), 0) Traceback (most recent call last): @@ -502,7 +502,7 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: For non-integer/boolean dtypes, an appropriate error is raised: - >>> indexer = np.array([0., 2.], dtype="float64") + >>> indexer = np.array([0.0, 2.0], dtype="float64") >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 10a3fcc61b5bc..54cdccd2ec11c 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -260,7 +260,9 @@ class DatetimeProperties(Properties): Examples -------- - >>> seconds_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) + >>> seconds_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="s") + ... ) >>> seconds_series 0 2000-01-01 00:00:00 1 2000-01-01 00:00:01 @@ -272,7 +274,9 @@ class DatetimeProperties(Properties): 2 2 dtype: int32 - >>> hours_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h")) + >>> hours_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="h") + ... ) >>> hours_series 0 2000-01-01 00:00:00 1 2000-01-01 01:00:00 @@ -284,7 +288,9 @@ class DatetimeProperties(Properties): 2 2 dtype: int32 - >>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="QE")) + >>> quarters_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="QE") + ... ) >>> quarters_series 0 2000-03-31 1 2000-06-30 @@ -328,7 +334,7 @@ def to_pydatetime(self) -> np.ndarray: Examples -------- - >>> s = pd.Series(pd.date_range('20180310', periods=2)) + >>> s = pd.Series(pd.date_range("20180310", periods=2)) >>> s 0 2018-03-10 1 2018-03-11 @@ -340,7 +346,9 @@ def to_pydatetime(self) -> np.ndarray: pandas' nanosecond precision is truncated to microseconds. - >>> s = pd.Series(pd.date_range('20180310', periods=2, freq='ns')) + >>> s = pd.Series( + ... pd.date_range("20180310", periods=2, freq="ns") + ... ) >>> s 0 2018-03-10 00:00:00.000000000 1 2018-03-10 00:00:00.000000001 @@ -414,7 +422,9 @@ class TimedeltaProperties(Properties): Examples -------- >>> seconds_series = pd.Series( - ... pd.timedelta_range(start="1 second", periods=3, freq="s") + ... pd.timedelta_range( + ... start="1 second", periods=3, freq="s" + ... ) ... ) >>> seconds_series 0 0 days 00:00:01 @@ -476,7 +486,7 @@ def components(self): Examples -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="s")) >>> s 0 0 days 00:00:00 1 0 days 00:00:01 @@ -520,7 +530,9 @@ class PeriodProperties(Properties): -------- >>> seconds_series = pd.Series( ... pd.period_range( - ... start="2000-01-01 00:00:00", end="2000-01-01 00:00:03", freq="s" + ... start="2000-01-01 00:00:00", + ... end="2000-01-01 00:00:03", + ... freq="s", ... ) ... ) >>> seconds_series @@ -537,7 +549,11 @@ class PeriodProperties(Properties): dtype: int64 >>> hours_series = pd.Series( - ... pd.period_range(start="2000-01-01 00:00", end="2000-01-01 03:00", freq="h") + ... pd.period_range( + ... start="2000-01-01 00:00", + ... end="2000-01-01 03:00", + ... freq="h", + ... ) ... ) >>> hours_series 0 2000-01-01 00:00 @@ -553,7 +569,9 @@ class PeriodProperties(Properties): dtype: int64 >>> quarters_series = pd.Series( - ... 
pd.period_range(start="2000-01-01", end="2000-12-31", freq="Q-DEC") + ... pd.period_range( + ... start="2000-01-01", end="2000-12-31", freq="Q-DEC" + ... ) ... ) >>> quarters_series 0 2000Q1 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9a5fae445df75..ea9fb01e412cd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -361,7 +361,7 @@ class Index(IndexOpsMixin, PandasObject): >>> pd.Index([1, 2, 3]) Index([1, 2, 3], dtype='int64') - >>> pd.Index(list('abc')) + >>> pd.Index(list("abc")) Index(['a', 'b', 'c'], dtype='object') >>> pd.Index([1, 2, 3], dtype="uint8") @@ -710,7 +710,7 @@ def _format_duplicate_message(self) -> DataFrame: Examples -------- - >>> idx = pd.Index(['a', 'b', 'a']) + >>> idx = pd.Index(["a", "b", "a"]) >>> idx._format_duplicate_message() positions label @@ -797,7 +797,7 @@ def is_(self, other) -> bool: Examples -------- - >>> idx1 = pd.Index(['1', '2', '3']) + >>> idx1 = pd.Index(["1", "2", "3"]) >>> idx1.is_(idx1.view()) True @@ -986,7 +986,7 @@ def ravel(self, order: str_t = "C") -> Self: Examples -------- - >>> s = pd.Series([1, 2, 3], index=['a', 'b', 'c']) + >>> s = pd.Series([1, 2, 3], index=["a", "b", "c"]) >>> s.index.ravel() Index(['a', 'b', 'c'], dtype='object') """ @@ -1056,7 +1056,7 @@ def astype(self, dtype, copy: bool = True): >>> idx = pd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='int64') - >>> idx.astype('float') + >>> idx.astype("float") Index([1.0, 2.0, 3.0], dtype='float64') """ if dtype is not None: @@ -1092,9 +1092,7 @@ def astype(self, dtype, copy: bool = True): result._references.add_index_reference(result) return result - _index_shared_docs[ - "take" - ] = """ + _index_shared_docs["take"] = """ Return a new %(klass)s of the values selected by the indices. For internal compatibility with numpy arrays. @@ -1181,9 +1179,7 @@ def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool: allow_fill = False return allow_fill - _index_shared_docs[ - "repeat" - ] = """ + _index_shared_docs["repeat"] = """ Repeat elements of a %(klass)s. Returns a new %(klass)s where each element of the current %(klass)s @@ -1260,7 +1256,7 @@ def copy( Examples -------- - >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx = pd.Index(["a", "b", "c"]) >>> new_idx = idx.copy() >>> idx is new_idx False @@ -1552,7 +1548,7 @@ def to_series(self, index=None, name: Hashable | None = None) -> Series: Examples -------- - >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx = pd.Index(["Ant", "Bear", "Cow"], name="animal") By default, the original index and original name is reused. 
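Since every hunk in these files only re-wraps doctest code and must leave the expected output untouched, each rewritten example can be checked mechanically: feed it back through the stdlib doctest machinery and confirm the documented output still matches. A minimal sketch of that check (not part of the patch itself; the wrapper function name is made up here, and the example is copied from the astype hunk above):

import doctest

import pandas as pd


def _reformatted_example():
    """
    >>> idx = pd.Index([1, 2, 3])
    >>> idx.astype("float")
    Index([1.0, 2.0, 3.0], dtype='float64')
    """


# Silent on success; on any drift it prints the failing example and a
# comparison of expected vs. actual output.
doctest.run_docstring_examples(_reformatted_example, {"pd": pd})

The same comparison can be applied module-wide with pytest's --doctest-modules flag, which is presumably how rewrites of this volume get validated in bulk.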
@@ -1573,7 +1569,7 @@ def to_series(self, index=None, name: Hashable | None = None) -> Series: To override the name of the resulting column, specify ``name``: - >>> idx.to_series(name='zoo') + >>> idx.to_series(name="zoo") animal Ant Ant Bear Bear @@ -1616,7 +1612,7 @@ def to_frame( Examples -------- - >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx = pd.Index(["Ant", "Bear", "Cow"], name="animal") >>> idx.to_frame() animal animal @@ -1634,7 +1630,7 @@ def to_frame( To override the name of the resulting column, specify `name`: - >>> idx.to_frame(index=False, name='zoo') + >>> idx.to_frame(index=False, name="zoo") zoo 0 Ant 1 Bear @@ -1660,7 +1656,7 @@ def name(self) -> Hashable: Examples -------- - >>> idx = pd.Index([1, 2, 3], name='x') + >>> idx = pd.Index([1, 2, 3], name="x") >>> idx Index([1, 2, 3], dtype='int64', name='x') >>> idx.name @@ -1829,19 +1825,20 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: >>> idx = pd.Index([1, 2, 3, 4]) >>> idx Index([1, 2, 3, 4], dtype='int64') - >>> idx.set_names('quarter') + >>> idx.set_names("quarter") Index([1, 2, 3, 4], dtype='int64', name='quarter') - >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]]) + >>> idx = pd.MultiIndex.from_product( + ... [["python", "cobra"], [2018, 2019]] + ... ) >>> idx MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], ) - >>> idx = idx.set_names(['kind', 'year']) - >>> idx.set_names('species', level=0) + >>> idx = idx.set_names(["kind", "year"]) + >>> idx.set_names("species", level=0) MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), @@ -1850,7 +1847,7 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: When renaming levels with a dict, levels can not be passed. - >>> idx.set_names({'kind': 'snake'}) + >>> idx.set_names({"kind": "snake"}) MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), @@ -1922,26 +1919,27 @@ def rename(self, name, inplace: bool = False): Examples -------- - >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score') - >>> idx.rename('grade') + >>> idx = pd.Index(["A", "C", "A", "B"], name="score") + >>> idx.rename("grade") Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') - >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]], - ... names=['kind', 'year']) + >>> idx = pd.MultiIndex.from_product( + ... [["python", "cobra"], [2018, 2019]], + ... names=["kind", "year"], + ... ) >>> idx MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], names=['kind', 'year']) - >>> idx.rename(['species', 'year']) + >>> idx.rename(["species", "year"]) MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], names=['species', 'year']) - >>> idx.rename('species') + >>> idx.rename("species") Traceback (most recent call last): TypeError: Must pass list-like as `names`. """ @@ -2064,7 +2062,7 @@ def _get_level_values(self, level) -> Index: Examples -------- - >>> idx = pd.Index(list('abc')) + >>> idx = pd.Index(list("abc")) >>> idx Index(['a', 'b', 'c'], dtype='object') @@ -2099,7 +2097,8 @@ def droplevel(self, level: IndexLabel = 0): Examples -------- >>> mi = pd.MultiIndex.from_arrays( - ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + ... [[1, 2], [3, 4], [5, 6]], names=["x", "y", "z"] + ... 
) >>> mi MultiIndex([(1, 3, 5), (2, 4, 6)], @@ -2115,12 +2114,12 @@ def droplevel(self, level: IndexLabel = 0): (2, 4)], names=['x', 'y']) - >>> mi.droplevel('z') + >>> mi.droplevel("z") MultiIndex([(1, 3), (2, 4)], names=['x', 'y']) - >>> mi.droplevel(['x', 'y']) + >>> mi.droplevel(["x", "y"]) Index([5, 6], dtype='int64', name='z') """ if not isinstance(level, (tuple, list)): @@ -2307,13 +2306,15 @@ def is_unique(self) -> bool: >>> idx.is_unique True - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") + >>> idx = pd.Index( + ... ["Watermelon", "Orange", "Apple", "Watermelon"] + ... ).astype("category") >>> idx.is_unique False - >>> idx = pd.Index(["Orange", "Apple", - ... "Watermelon"]).astype("category") + >>> idx = pd.Index(["Orange", "Apple", "Watermelon"]).astype( + ... "category" + ... ) >>> idx.is_unique True """ @@ -2344,13 +2345,15 @@ def has_duplicates(self) -> bool: >>> idx.has_duplicates False - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") + >>> idx = pd.Index( + ... ["Watermelon", "Orange", "Apple", "Watermelon"] + ... ).astype("category") >>> idx.has_duplicates True - >>> idx = pd.Index(["Orange", "Apple", - ... "Watermelon"]).astype("category") + >>> idx = pd.Index(["Orange", "Apple", "Watermelon"]).astype( + ... "category" + ... ) >>> idx.has_duplicates False """ @@ -2580,8 +2583,9 @@ def is_object(self) -> bool: >>> idx.is_object() # doctest: +SKIP True - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") + >>> idx = pd.Index( + ... ["Watermelon", "Orange", "Apple", "Watermelon"] + ... ).astype("category") >>> idx.is_object() # doctest: +SKIP False @@ -2622,8 +2626,9 @@ def is_categorical(self) -> bool: Examples -------- - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") + >>> idx = pd.Index( + ... ["Watermelon", "Orange", "Apple", "Watermelon"] + ... ).astype("category") >>> idx.is_categorical() # doctest: +SKIP True @@ -2675,8 +2680,12 @@ def is_interval(self) -> bool: Examples -------- - >>> idx = pd.Index([pd.Interval(left=0, right=5), - ... pd.Interval(left=5, right=10)]) + >>> idx = pd.Index( + ... [ + ... pd.Interval(left=0, right=5), + ... pd.Interval(left=5, right=10), + ... ] + ... ) >>> idx.is_interval() # doctest: +SKIP True @@ -2801,7 +2810,7 @@ def hasnans(self) -> bool: Examples -------- - >>> s = pd.Series([1, 2, 3], index=['a', 'b', None]) + >>> s = pd.Series([1, 2, 3], index=["a", "b", None]) >>> s a 1 b 2 @@ -2852,7 +2861,7 @@ def isna(self) -> npt.NDArray[np.bool_]: Empty strings are not considered NA values. None is considered an NA value. - >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx = pd.Index(["black", "", "red", None]) >>> idx Index(['black', '', 'red', None], dtype='object') >>> idx.isna() @@ -2860,8 +2869,14 @@ def isna(self) -> npt.NDArray[np.bool_]: For datetimes, `NaT` (Not a Time) is considered as an NA value. - >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... pd.Timestamp(''), None, pd.NaT]) + >>> idx = pd.DatetimeIndex( + ... [ + ... pd.Timestamp("1940-04-25"), + ... pd.Timestamp(""), + ... None, + ... pd.NaT, + ... ] + ... ) >>> idx DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], dtype='datetime64[ns]', freq=None) @@ -2908,7 +2923,7 @@ def notna(self) -> npt.NDArray[np.bool_]: Empty strings are not considered NA values. None is considered a NA value. 
- >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx = pd.Index(["black", "", "red", None]) >>> idx Index(['black', '', 'red', None], dtype='object') >>> idx.notna() @@ -3068,19 +3083,21 @@ def drop_duplicates(self, *, keep: DropKeep = "first") -> Self: -------- Generate a pandas.Index with duplicate values. - >>> idx = pd.Index(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo']) + >>> idx = pd.Index( + ... ["llama", "cow", "llama", "beetle", "llama", "hippo"] + ... ) The `keep` parameter controls which duplicate values are removed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. - >>> idx.drop_duplicates(keep='first') + >>> idx.drop_duplicates(keep="first") Index(['llama', 'cow', 'beetle', 'hippo'], dtype='object') The value 'last' keeps the last occurrence for each set of duplicated entries. - >>> idx.drop_duplicates(keep='last') + >>> idx.drop_duplicates(keep="last") Index(['cow', 'beetle', 'llama', 'hippo'], dtype='object') The value ``False`` discards all sets of duplicated entries. @@ -3127,19 +3144,19 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: By default, for each set of duplicated values, the first occurrence is set to False and all others to True: - >>> idx = pd.Index(['llama', 'cow', 'llama', 'beetle', 'llama']) + >>> idx = pd.Index(["llama", "cow", "llama", "beetle", "llama"]) >>> idx.duplicated() array([False, False, True, False, True]) which is equivalent to - >>> idx.duplicated(keep='first') + >>> idx.duplicated(keep="first") array([False, False, True, False, True]) By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True: - >>> idx.duplicated(keep='last') + >>> idx.duplicated(keep="last") array([ True, False, True, False, False]) By setting keep on ``False``, all duplicates are True: @@ -3248,7 +3265,7 @@ def union(self, other, sort=None): Union mismatched dtypes - >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) + >>> idx1 = pd.Index(["a", "b", "c", "d"]) >>> idx2 = pd.Index([1, 2, 3, 4]) >>> idx1.union(idx2) Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') @@ -3759,16 +3776,16 @@ def get_loc(self, key): Examples -------- - >>> unique_index = pd.Index(list('abc')) - >>> unique_index.get_loc('b') + >>> unique_index = pd.Index(list("abc")) + >>> unique_index.get_loc("b") 1 - >>> monotonic_index = pd.Index(list('abbc')) - >>> monotonic_index.get_loc('b') + >>> monotonic_index = pd.Index(list("abbc")) + >>> monotonic_index.get_loc("b") slice(1, 3, None) - >>> non_monotonic_index = pd.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') + >>> non_monotonic_index = pd.Index(list("abcb")) + >>> non_monotonic_index.get_loc("b") array([False, True, False, True]) """ casted_key = self._maybe_cast_indexer(key) @@ -3839,8 +3856,8 @@ def get_indexer( Examples -------- - >>> index = pd.Index(['c', 'a', 'b']) - >>> index.get_indexer(['a', 'b', 'x']) + >>> index = pd.Index(["c", "a", "b"]) + >>> index.get_indexer(["a", "b", "x"]) array([ 1, 2, -1]) Notice that the return value is an array of locations in ``index`` @@ -4350,10 +4367,10 @@ def reindex( Examples -------- - >>> idx = pd.Index(['car', 'bike', 'train', 'tractor']) + >>> idx = pd.Index(["car", "bike", "train", "tractor"]) >>> idx Index(['car', 'bike', 'train', 'tractor'], dtype='object') - >>> idx.reindex(['car', 'bike']) + >>> idx.reindex(["car", "bike"]) (Index(['car', 'bike'], dtype='object'), array([0, 1])) """ # GH6552: preserve names when reindexing to non-named
target @@ -4557,7 +4574,7 @@ def join( -------- >>> idx1 = pd.Index([1, 2, 3]) >>> idx2 = pd.Index([4, 5, 6]) - >>> idx1.join(idx2, how='outer') + >>> idx1.join(idx2, how="outer") Index([1, 2, 3, 4, 5, 6], dtype='int64') """ other = ensure_index(other) @@ -5237,10 +5254,10 @@ def where(self, cond, other=None) -> Index: Examples -------- - >>> idx = pd.Index(['car', 'bike', 'train', 'tractor']) + >>> idx = pd.Index(["car", "bike", "train", "tractor"]) >>> idx Index(['car', 'bike', 'train', 'tractor'], dtype='object') - >>> idx.where(idx.isin(['car', 'train']), 'other') + >>> idx.where(idx.isin(["car", "train"]), "other") Index(['car', 'other', 'train', 'other'], dtype='object') """ if isinstance(self, ABCMultiIndex): @@ -5569,10 +5586,10 @@ def equals(self, other: Any) -> bool: The dtype is *not* compared - >>> int64_idx = pd.Index([1, 2, 3], dtype='int64') + >>> int64_idx = pd.Index([1, 2, 3], dtype="int64") >>> int64_idx Index([1, 2, 3], dtype='int64') - >>> uint64_idx = pd.Index([1, 2, 3], dtype='uint64') + >>> uint64_idx = pd.Index([1, 2, 3], dtype="uint64") >>> uint64_idx Index([1, 2, 3], dtype='uint64') >>> int64_idx.equals(uint64_idx) @@ -5631,13 +5648,13 @@ def identical(self, other) -> bool: Examples -------- - >>> idx1 = pd.Index(['1', '2', '3']) - >>> idx2 = pd.Index(['1', '2', '3']) + >>> idx1 = pd.Index(["1", "2", "3"]) + >>> idx2 = pd.Index(["1", "2", "3"]) >>> idx2.identical(idx1) True - >>> idx1 = pd.Index(['1', '2', '3'], name="A") - >>> idx2 = pd.Index(['1', '2', '3'], name="B") + >>> idx1 = pd.Index(["1", "2", "3"], name="A") + >>> idx2 = pd.Index(["1", "2", "3"], name="B") >>> idx2.identical(idx1) False """ @@ -5685,26 +5702,27 @@ def asof(self, label): -------- `Index.asof` returns the latest index label up to the passed label. - >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) - >>> idx.asof('2014-01-01') + >>> idx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"]) + >>> idx.asof("2014-01-01") '2013-12-31' If the label is in the index, the method returns the passed label. - >>> idx.asof('2014-01-02') + >>> idx.asof("2014-01-02") '2014-01-02' If all of the labels in the index are later than the passed label, NaN is returned. - >>> idx.asof('1999-01-02') + >>> idx.asof("1999-01-02") nan If the index is not sorted, an error is raised. - >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', - ... '2014-01-03']) - >>> idx_not_sorted.asof('2013-12-31') + >>> idx_not_sorted = pd.Index( + ... ["2013-12-31", "2015-01-02", "2014-01-03"] + ... ) + >>> idx_not_sorted.asof("2013-12-31") Traceback (most recent call last): ValueError: index must be monotonic increasing or decreasing """ @@ -5764,9 +5782,14 @@ def asof_locs( Examples -------- - >>> idx = pd.date_range('2023-06-01', periods=3, freq='D') - >>> where = pd.DatetimeIndex(['2023-05-30 00:12:00', '2023-06-01 00:00:00', - ... '2023-06-02 23:59:59']) + >>> idx = pd.date_range("2023-06-01", periods=3, freq="D") + >>> where = pd.DatetimeIndex( + ... [ + ... "2023-05-30 00:12:00", + ... "2023-06-01 00:00:00", + ... "2023-06-02 23:59:59", + ... ] + ... 
) >>> mask = np.ones(3, dtype=bool) >>> idx.asof_locs(where, mask) array([-1, 0, 1]) @@ -5781,7 +5804,8 @@ def asof_locs( # types "Union[ExtensionArray, ndarray[Any, Any]]", "str" # TODO: will be fixed when ExtensionArray.searchsorted() is fixed locs = self._values[mask].searchsorted( - where._values, side="right" # type: ignore[call-overload] + where._values, + side="right", # type: ignore[call-overload] ) locs = np.where(locs > 0, locs - 1, 0) @@ -5924,7 +5948,9 @@ def shift(self, periods: int = 1, freq=None): -------- Put the first 5 month starts of 2011 into an index. - >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') + >>> month_starts = pd.date_range( + ... "1/1/2011", periods=5, freq="MS" + ... ) >>> month_starts DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', '2011-05-01'], @@ -5932,7 +5958,7 @@ def shift(self, periods: int = 1, freq=None): Shift the index by 10 days. - >>> month_starts.shift(10, freq='D') + >>> month_starts.shift(10, freq="D") DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', '2011-05-11'], dtype='datetime64[ns]', freq=None) @@ -5974,7 +6000,7 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: Examples -------- - >>> idx = pd.Index(['b', 'a', 'd', 'c']) + >>> idx = pd.Index(["b", "a", "d", "c"]) >>> idx Index(['b', 'a', 'd', 'c'], dtype='object') @@ -6007,9 +6033,7 @@ def _should_fallback_to_positional(self) -> bool: "complex", } - _index_shared_docs[ - "get_indexer_non_unique" - ] = """ + _index_shared_docs["get_indexer_non_unique"] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the @@ -6111,7 +6135,7 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]: Examples -------- - >>> idx = pd.Index([np.nan, 'var1', np.nan]) + >>> idx = pd.Index([np.nan, "var1", np.nan]) >>> idx.get_indexer_for([np.nan]) array([0, 2]) """ @@ -6410,16 +6434,16 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): Examples -------- >>> idx = pd.Index([1, 2, 3]) - >>> idx.map({1: 'a', 2: 'b', 3: 'c'}) + >>> idx.map({1: "a", 2: "b", 3: "c"}) Index(['a', 'b', 'c'], dtype='object') Using `map` with a function: >>> idx = pd.Index([1, 2, 3]) - >>> idx.map('I am a {}'.format) + >>> idx.map("I am a {}".format) Index(['I am a 1', 'I am a 2', 'I am a 3'], dtype='object') - >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx = pd.Index(["a", "b", "c"]) >>> idx.map(lambda x: x.upper()) Index(['A', 'B', 'C'], dtype='object') """ @@ -6514,7 +6538,7 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: Examples -------- - >>> idx = pd.Index([1,2,3]) + >>> idx = pd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='int64') @@ -6523,9 +6547,10 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: >>> idx.isin([1, 4]) array([ True, False, False]) - >>> midx = pd.MultiIndex.from_arrays([[1,2,3], - ... ['red', 'blue', 'green']], - ... names=('number', 'color')) + >>> midx = pd.MultiIndex.from_arrays( + ... [[1, 2, 3], ["red", "blue", "green"]], + ... names=("number", "color"), + ... ) >>> midx MultiIndex([(1, 'red'), (2, 'blue'), @@ -6535,12 +6560,12 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: Check whether the strings in the 'color' level of the MultiIndex are in a list of colors. 
- >>> midx.isin(['red', 'orange', 'yellow'], level='color') + >>> midx.isin(["red", "orange", "yellow"], level="color") array([ True, False, False]) To check across the levels of a MultiIndex, pass a list of tuples: - >>> midx.isin([(1, 'red'), (3, 'red')]) + >>> midx.isin([(1, "red"), (3, "red")]) array([ True, False, False]) """ if level is not None: @@ -6588,12 +6613,14 @@ def slice_indexer( -------- This is a method on all index types. For example you can do: - >>> idx = pd.Index(list('abcd')) - >>> idx.slice_indexer(start='b', end='c') + >>> idx = pd.Index(list("abcd")) + >>> idx.slice_indexer(start="b", end="c") slice(1, 3, None) - >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')]) - >>> idx.slice_indexer(start='b', end=('c', 'g')) + >>> idx = pd.MultiIndex.from_arrays( + ... [list("abcd"), list("efgh")] + ... ) + >>> idx.slice_indexer(start="b", end=("c", "g")) slice(1, 3, None) """ start_slice, end_slice = self.slice_locs(start, end, step=step) @@ -6704,16 +6731,16 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: Examples -------- >>> idx = pd.RangeIndex(5) - >>> idx.get_slice_bound(3, 'left') + >>> idx.get_slice_bound(3, "left") 3 - >>> idx.get_slice_bound(3, 'right') + >>> idx.get_slice_bound(3, "right") 4 If ``label`` is non-unique in the index, an error will be raised. - >>> idx_duplicate = pd.Index(['a', 'b', 'a', 'c', 'd']) - >>> idx_duplicate.get_slice_bound('a', 'left') + >>> idx_duplicate = pd.Index(["a", "b", "a", "c", "d"]) + >>> idx_duplicate.get_slice_bound("a", "left") Traceback (most recent call last): KeyError: Cannot get left slice bound for non-unique label: 'a' """ @@ -6789,8 +6816,8 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: Examples -------- - >>> idx = pd.Index(list('abcd')) - >>> idx.slice_locs(start='b', end='c') + >>> idx = pd.Index(list("abcd")) + >>> idx.slice_locs(start="b", end="c") (1, 3) """ inc = step is None or step >= 0 @@ -6871,11 +6898,11 @@ def delete(self, loc) -> Self: Examples -------- - >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx = pd.Index(["a", "b", "c"]) >>> idx.delete(1) Index(['a', 'c'], dtype='object') - >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx = pd.Index(["a", "b", "c"]) >>> idx.delete([0, 2]) Index(['b'], dtype='object') """ @@ -6907,8 +6934,8 @@ def insert(self, loc: int, item) -> Index: Examples -------- - >>> idx = pd.Index(['a', 'b', 'c']) - >>> idx.insert(1, 'x') + >>> idx = pd.Index(["a", "b", "c"]) + >>> idx.insert(1, "x") Index(['a', 'x', 'b', 'c'], dtype='object') """ item = lib.item_from_zerodim(item) @@ -6991,8 +7018,8 @@ def drop( Examples -------- - >>> idx = pd.Index(['a', 'b', 'c']) - >>> idx.drop(['a']) + >>> idx = pd.Index(["a", "b", "c"]) + >>> idx.drop(["a"]) Index(['b', 'c'], dtype='object') """ if not isinstance(labels, Index): @@ -7088,7 +7115,9 @@ def round(self, decimals: int = 0) -> Self: Examples -------- >>> import pandas as pd - >>> idx = pd.Index([10.1234, 20.5678, 30.9123, 40.4567, 50.7890]) + >>> idx = pd.Index( + ... [10.1234, 20.5678, 30.9123, 40.4567, 50.7890] + ... ) >>> idx.round(decimals=2) Index([10.12, 20.57, 30.91, 40.46, 50.79], dtype='float64') @@ -7370,13 +7399,13 @@ def min(self, axis=None, skipna: bool = True, *args, **kwargs): >>> idx.min() 1 - >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx = pd.Index(["c", "b", "a"]) >>> idx.min() 'a' For a MultiIndex, the minimum is determined lexicographically. 
- >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) + >>> idx = pd.MultiIndex.from_product([("a", "b"), (2, 1)]) >>> idx.min() ('a', 1) """ @@ -7433,13 +7462,13 @@ def max(self, axis=None, skipna: bool = True, *args, **kwargs): >>> idx.max() 3 - >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx = pd.Index(["c", "b", "a"]) >>> idx.max() 'c' For a MultiIndex, the maximum is determined lexicographically. - >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) + >>> idx = pd.MultiIndex.from_product([("a", "b"), (2, 1)]) >>> idx.max() ('b', 2) """ @@ -7508,7 +7537,9 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: >>> ensure_index_from_sequences([[1, 2, 3]], names=["name"]) Index([1, 2, 3], dtype='int64', name='name') - >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"]) + >>> ensure_index_from_sequences( + ... [["a", "a"], ["a", "b"]], names=["L1", "L2"] + ... ) MultiIndex([('a', 'a'), ('a', 'b')], names=['L1', 'L2']) @@ -7547,13 +7578,13 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: Examples -------- - >>> ensure_index(['a', 'b']) + >>> ensure_index(["a", "b"]) Index(['a', 'b'], dtype='object') - >>> ensure_index([('a', 'a'), ('b', 'c')]) + >>> ensure_index([("a", "a"), ("b", "c")]) Index([('a', 'a'), ('b', 'c')], dtype='object') - >>> ensure_index([['a', 'a'], ['b', 'c']]) + >>> ensure_index([["a", "a"], ["b", "c"]]) MultiIndex([('a', 'b'), ('a', 'c')], ) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b307be004ad6e..1f4e9a66057f5 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -160,7 +160,9 @@ class CategoricalIndex(NDArrayBackedExtensionIndex): Ordered ``CategoricalIndex`` can have a min and max value. >>> ci = pd.CategoricalIndex( - ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"] + ... ["a", "b", "c", "a", "b", "c"], + ... ordered=True, + ... categories=["c", "b", "a"], ... ) >>> ci CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], @@ -284,14 +286,16 @@ def equals(self, other: object) -> bool: Examples -------- - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) - >>> ci2 = pd.CategoricalIndex(pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) + >>> ci2 = pd.CategoricalIndex( + ... pd.Categorical(["a", "b", "c", "a", "b", "c"]) + ... ) >>> ci.equals(ci2) True The order of elements matters. - >>> ci3 = pd.CategoricalIndex(['c', 'b', 'a', 'a', 'b', 'c']) + >>> ci3 = pd.CategoricalIndex(["c", "b", "a", "a", "b", "c"]) >>> ci.equals(ci3) False @@ -304,16 +308,17 @@ def equals(self, other: object) -> bool: The categories matter, but the order of the categories matters only when ``ordered=True``. - >>> ci5 = ci.set_categories(['a', 'b', 'c', 'd']) + >>> ci5 = ci.set_categories(["a", "b", "c", "d"]) >>> ci.equals(ci5) False - >>> ci6 = ci.set_categories(['b', 'c', 'a']) + >>> ci6 = ci.set_categories(["b", "c", "a"]) >>> ci.equals(ci6) True - >>> ci_ordered = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], - ... ordered=True) - >>> ci2_ordered = ci_ordered.set_categories(['b', 'c', 'a']) + >>> ci_ordered = pd.CategoricalIndex( + ... ["a", "b", "c", "a", "b", "c"], ordered=True + ... 
) + >>> ci2_ordered = ci_ordered.set_categories(["b", "c", "a"]) >>> ci_ordered.equals(ci2_ordered) False """ @@ -462,37 +467,37 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): Examples -------- - >>> idx = pd.CategoricalIndex(['a', 'b', 'c']) + >>> idx = pd.CategoricalIndex(["a", "b", "c"]) >>> idx CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') >>> idx.map(lambda x: x.upper()) CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=False, dtype='category') - >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'}) + >>> idx.map({"a": "first", "b": "second", "c": "third"}) CategoricalIndex(['first', 'second', 'third'], categories=['first', 'second', 'third'], ordered=False, dtype='category') If the mapping is one-to-one the ordering of the categories is preserved: - >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True) + >>> idx = pd.CategoricalIndex(["a", "b", "c"], ordered=True) >>> idx CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=True, dtype='category') - >>> idx.map({'a': 3, 'b': 2, 'c': 1}) + >>> idx.map({"a": 3, "b": 2, "c": 1}) CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True, dtype='category') If the mapping is not one-to-one an :class:`~pandas.Index` is returned: - >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'}) + >>> idx.map({"a": "first", "b": "second", "c": "first"}) Index(['first', 'second', 'first'], dtype='object') If a `dict` is used, all unmapped categories are mapped to `NaN` and the result is an :class:`~pandas.Index`: - >>> idx.map({'a': 'first', 'b': 'second'}) + >>> idx.map({"a": "first", "b": "second"}) Index(['first', 'second', nan], dtype='object') """ mapped = self._values.map(mapper, na_action=na_action) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 2b03a64236128..ba39a0ca92904 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -462,20 +462,20 @@ def as_unit(self, unit: str) -> Self: -------- For :class:`pandas.DatetimeIndex`: - >>> idx = pd.DatetimeIndex(['2020-01-02 01:02:03.004005006']) + >>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]) >>> idx DatetimeIndex(['2020-01-02 01:02:03.004005006'], dtype='datetime64[ns]', freq=None) - >>> idx.as_unit('s') + >>> idx.as_unit("s") DatetimeIndex(['2020-01-02 01:02:03'], dtype='datetime64[s]', freq=None) For :class:`pandas.TimedeltaIndex`: - >>> tdelta_idx = pd.to_timedelta(['1 day 3 min 2 us 42 ns']) + >>> tdelta_idx = pd.to_timedelta(["1 day 3 min 2 us 42 ns"]) >>> tdelta_idx TimedeltaIndex(['1 days 00:03:00.000002042'], dtype='timedelta64[ns]', freq=None) - >>> tdelta_idx.as_unit('s') + >>> tdelta_idx.as_unit("s") TimedeltaIndex(['1 days 00:03:00'], dtype='timedelta64[s]', freq=None) """ arr = self._data.as_unit(unit) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 73143730085d6..e06c34ec98654 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -249,7 +249,9 @@ class DatetimeIndex(DatetimeTimedeltaMixin): Examples -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... 
) >>> idx DatetimeIndex(['2020-01-01 10:00:00+00:00', '2020-02-01 11:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) @@ -485,12 +487,13 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: Examples -------- - >>> idx = pd.DatetimeIndex(['2023-01-01', '2023-01-02', - ... '2023-02-01', '2023-02-02']) + >>> idx = pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"] + ... ) >>> idx DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], dtype='datetime64[ns]', freq=None) - >>> idx.snap('MS') + >>> idx.snap("MS") DatetimeIndex(['2023-01-01', '2023-01-01', '2023-02-01', '2023-02-01'], dtype='datetime64[ns]', freq=None) """ @@ -732,8 +735,9 @@ def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]: Examples -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00", "2/1/2020 11:00", - ... "3/1/2020 10:00"]) + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00", "2/1/2020 11:00", "3/1/2020 10:00"] + ... ) >>> idx.indexer_at_time("10:00") array([0, 2]) """ @@ -901,7 +905,7 @@ def date_range( Specify `start` and `end`, with the default daily frequency. - >>> pd.date_range(start='1/1/2018', end='1/08/2018') + >>> pd.date_range(start="1/1/2018", end="1/08/2018") DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], dtype='datetime64[ns]', freq='D') @@ -909,8 +913,12 @@ def date_range( Specify timezone-aware `start` and `end`, with the default daily frequency. >>> pd.date_range( - ... start=pd.to_datetime("1/1/2018").tz_localize("Europe/Berlin"), - ... end=pd.to_datetime("1/08/2018").tz_localize("Europe/Berlin"), + ... start=pd.to_datetime("1/1/2018").tz_localize( + ... "Europe/Berlin" + ... ), + ... end=pd.to_datetime("1/08/2018").tz_localize( + ... "Europe/Berlin" + ... ), ... ) DatetimeIndex(['2018-01-01 00:00:00+01:00', '2018-01-02 00:00:00+01:00', '2018-01-03 00:00:00+01:00', '2018-01-04 00:00:00+01:00', @@ -920,14 +928,14 @@ def date_range( Specify `start` and `periods`, the number of periods (days). - >>> pd.date_range(start='1/1/2018', periods=8) + >>> pd.date_range(start="1/1/2018", periods=8) DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], dtype='datetime64[ns]', freq='D') Specify `end` and `periods`, the number of periods (days). - >>> pd.date_range(end='1/1/2018', periods=8) + >>> pd.date_range(end="1/1/2018", periods=8) DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], dtype='datetime64[ns]', freq='D') @@ -935,7 +943,9 @@ def date_range( Specify `start`, `end`, and `periods`; the frequency is generated automatically (linearly spaced). - >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3) + >>> pd.date_range( + ... start="2018-04-24", end="2018-04-27", periods=3 + ... ) DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', '2018-04-27 00:00:00'], dtype='datetime64[ns]', freq=None) @@ -944,28 +954,30 @@ def date_range( Changed the `freq` (frequency) to ``'ME'`` (month end frequency). 
- >>> pd.date_range(start='1/1/2018', periods=5, freq='ME') + >>> pd.date_range(start="1/1/2018", periods=5, freq="ME") DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', '2018-05-31'], dtype='datetime64[ns]', freq='ME') Multiples are allowed - >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME') + >>> pd.date_range(start="1/1/2018", periods=5, freq="3ME") DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], dtype='datetime64[ns]', freq='3ME') `freq` can also be specified as an Offset object. - >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) + >>> pd.date_range( + ... start="1/1/2018", periods=5, freq=pd.offsets.MonthEnd(3) + ... ) DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], dtype='datetime64[ns]', freq='3ME') Specify `tz` to set the timezone. - >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo') + >>> pd.date_range(start="1/1/2018", periods=5, tz="Asia/Tokyo") DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00', '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00', '2018-01-05 00:00:00+09:00'], @@ -974,26 +986,34 @@ def date_range( `inclusive` controls whether to include `start` and `end` that are on the boundary. The default, "both", includes boundary points on either end. - >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both") + >>> pd.date_range( + ... start="2017-01-01", end="2017-01-04", inclusive="both" + ... ) DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq='D') Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. - >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left') + >>> pd.date_range( + ... start="2017-01-01", end="2017-01-04", inclusive="left" + ... ) DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq='D') Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and similarly ``inclusive='neither'`` will exclude both `start` and `end`. - >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right') + >>> pd.date_range( + ... start="2017-01-01", end="2017-01-04", inclusive="right" + ... ) DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq='D') **Specify a unit** - >>> pd.date_range(start="2017-01-01", periods=10, freq="100YS", unit="s") + >>> pd.date_range( + ... start="2017-01-01", periods=10, freq="100YS", unit="s" + ... ) DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01', '2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01', '2817-01-01', '2917-01-01'], @@ -1083,7 +1103,7 @@ def bdate_range( -------- Note how the two weekend days are skipped in the result. - >>> pd.bdate_range(start='1/1/2018', end='1/08/2018') + >>> pd.bdate_range(start="1/1/2018", end="1/08/2018") DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-08'], dtype='datetime64[ns]', freq='B') diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 4fcdb87974511..728e774e82f17 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -470,7 +470,9 @@ def is_overlapping(self) -> bool: Examples -------- - >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)]) + >>> index = pd.IntervalIndex.from_tuples( + ... [(0, 2), (1, 3), (4, 5)] + ... 
) >>> index IntervalIndex([(0, 2], (1, 3], (4, 5]], dtype='interval[int64, right]') @@ -479,7 +481,7 @@ def is_overlapping(self) -> bool: Intervals that share closed endpoints overlap: - >>> index = pd.interval_range(0, 3, closed='both') + >>> index = pd.interval_range(0, 3, closed="both") >>> index IntervalIndex([[0, 1], [1, 2], [2, 3]], dtype='interval[int64, both]') @@ -488,7 +490,7 @@ def is_overlapping(self) -> bool: Intervals that only have an open endpoint in common do not overlap: - >>> index = pd.interval_range(0, 3, closed='left') + >>> index = pd.interval_range(0, 3, closed="left") >>> index IntervalIndex([[0, 1), [1, 2), [2, 3)], dtype='interval[int64, left]') @@ -555,9 +557,7 @@ def _maybe_convert_i8(self, key): right = self._maybe_convert_i8(key.right) constructor = Interval if scalar else IntervalIndex.from_arrays # error: "object" not callable - return constructor( - left, right, closed=self.closed - ) # type: ignore[operator] + return constructor(left, right, closed=self.closed) # type: ignore[operator] if scalar: # Timestamp/Timedelta @@ -1019,8 +1019,10 @@ def interval_range( Additionally, datetime-like input is also supported. - >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), - ... end=pd.Timestamp('2017-01-04')) + >>> pd.interval_range( + ... start=pd.Timestamp("2017-01-01"), + ... end=pd.Timestamp("2017-01-04"), + ... ) IntervalIndex([(2017-01-01 00:00:00, 2017-01-02 00:00:00], (2017-01-02 00:00:00, 2017-01-03 00:00:00], (2017-01-03 00:00:00, 2017-01-04 00:00:00]], @@ -1037,8 +1039,9 @@ def interval_range( Similarly, for datetime-like ``start`` and ``end``, the frequency must be convertible to a DateOffset. - >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), - ... periods=3, freq='MS') + >>> pd.interval_range( + ... start=pd.Timestamp("2017-01-01"), periods=3, freq="MS" + ... ) IntervalIndex([(2017-01-01 00:00:00, 2017-02-01 00:00:00], (2017-02-01 00:00:00, 2017-03-01 00:00:00], (2017-03-01 00:00:00, 2017-04-01 00:00:00]], @@ -1054,7 +1057,7 @@ def interval_range( The ``closed`` parameter specifies which endpoints of the individual intervals within the ``IntervalIndex`` are closed. - >>> pd.interval_range(end=5, periods=4, closed='both') + >>> pd.interval_range(end=5, periods=4, closed="both") IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], dtype='interval[int64, both]') """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 46343b84afb43..cc5af2ca9afd6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -296,8 +296,8 @@ class MultiIndex(Index): methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product` and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``): - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + >>> arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + >>> pd.MultiIndex.from_arrays(arrays, names=("number", "color")) MultiIndex([(1, 'red'), (1, 'blue'), (2, 'red'), @@ -505,8 +505,8 @@ def from_arrays( Examples -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + >>> arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + >>> pd.MultiIndex.from_arrays(arrays, names=("number", "color")) MultiIndex([(1, 'red'), (1, 'blue'), (2, 'red'), @@ -576,9 +576,8 @@ def from_tuples( Examples -------- - >>> tuples = [(1, 'red'), (1, 'blue'), - ... 
(2, 'red'), (2, 'blue')] - >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + >>> tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] + >>> pd.MultiIndex.from_tuples(tuples, names=("number", "color")) MultiIndex([(1, 'red'), (1, 'blue'), (2, 'red'), @@ -658,9 +657,10 @@ def from_product( Examples -------- >>> numbers = [0, 1, 2] - >>> colors = ['green', 'purple'] - >>> pd.MultiIndex.from_product([numbers, colors], - ... names=['number', 'color']) + >>> colors = ["green", "purple"] + >>> pd.MultiIndex.from_product( + ... [numbers, colors], names=["number", "color"] + ... ) MultiIndex([(0, 'green'), (0, 'purple'), (1, 'green'), @@ -720,9 +720,15 @@ def from_frame( Examples -------- - >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], - ... ['NJ', 'Temp'], ['NJ', 'Precip']], - ... columns=['a', 'b']) + >>> df = pd.DataFrame( + ... [ + ... ["HI", "Temp"], + ... ["HI", "Precip"], + ... ["NJ", "Temp"], + ... ["NJ", "Precip"], + ... ], + ... columns=["a", "b"], + ... ) >>> df a b 0 HI Temp @@ -739,7 +745,7 @@ def from_frame( Using explicit names, instead of the column names - >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) + >>> pd.MultiIndex.from_frame(df, names=["state", "observation"]) MultiIndex([('HI', 'Temp'), ('HI', 'Precip'), ('NJ', 'Temp'), @@ -807,8 +813,10 @@ def dtypes(self) -> Series: Examples -------- - >>> idx = pd.MultiIndex.from_product([(0, 1, 2), ('green', 'purple')], - ... names=['number', 'color']) + >>> idx = pd.MultiIndex.from_product( + ... [(0, 1, 2), ("green", "purple")], + ... names=["number", "color"], + ... ) >>> idx MultiIndex([(0, 'green'), (0, 'purple'), @@ -861,10 +869,13 @@ def levels(self) -> FrozenList: Examples -------- - >>> index = pd.MultiIndex.from_product([['mammal'], - ... ('goat', 'human', 'cat', 'dog')], - ... names=['Category', 'Animals']) - >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs']) + >>> index = pd.MultiIndex.from_product( + ... [["mammal"], ("goat", "human", "cat", "dog")], + ... names=["Category", "Animals"], + ... ) + >>> leg_num = pd.DataFrame( + ... data=(4, 2, 4, 4), index=index, columns=["Legs"] + ... ) >>> leg_num Legs Category Animals @@ -973,9 +984,9 @@ def set_levels( ... (2, "one"), ... (2, "two"), ... (3, "one"), - ... (3, "two") + ... (3, "two"), ... ], - ... names=["foo", "bar"] + ... names=["foo", "bar"], ... ) >>> idx MultiIndex([(1, 'one'), @@ -986,7 +997,7 @@ def set_levels( (3, 'two')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) + >>> idx.set_levels([["a", "b", "c"], [1, 2]]) MultiIndex([('a', 1), ('a', 2), ('b', 1), @@ -994,7 +1005,7 @@ def set_levels( ('c', 1), ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b', 'c'], level=0) + >>> idx.set_levels(["a", "b", "c"], level=0) MultiIndex([('a', 'one'), ('a', 'two'), ('b', 'one'), @@ -1002,7 +1013,7 @@ def set_levels( ('c', 'one'), ('c', 'two')], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b'], level='bar') + >>> idx.set_levels(["a", "b"], level="bar") MultiIndex([(1, 'a'), (1, 'b'), (2, 'a'), @@ -1016,7 +1027,9 @@ def set_levels( be stored in the MultiIndex levels, though the values will be truncated in the MultiIndex output. - >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) + >>> idx.set_levels( + ... [["a", "b", "c"], [1, 2, 3, 4]], level=[0, 1] + ... 
) MultiIndex([('a', 1), ('a', 2), ('b', 1), @@ -1024,7 +1037,9 @@ ('c', 1), ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + >>> idx.set_levels( + ... [["a", "b", "c"], [1, 2, 3, 4]], level=[0, 1] + ... ).levels FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ @@ -1050,7 +1065,7 @@ def nlevels(self) -> int: Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) >>> mi MultiIndex([('a', 'b', 'c')], ) @@ -1066,7 +1081,7 @@ def levshape(self) -> Shape: Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) >>> mi MultiIndex([('a', 'b', 'c')], ) @@ -1146,7 +1161,8 @@ def set_codes( Examples -------- >>> idx = pd.MultiIndex.from_tuples( - ... [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"] + ... [(1, "one"), (1, "two"), (2, "one"), (2, "two")], + ... names=["foo", "bar"], ... ) >>> idx MultiIndex([(1, 'one'), @@ -1167,7 +1183,7 @@ (2, 'one'), (1, 'two')], names=['foo', 'bar']) - >>> idx.set_codes([0, 0, 1, 1], level='bar') + >>> idx.set_codes([0, 0, 1, 1], level="bar") MultiIndex([(1, 'one'), (1, 'one'), (2, 'two'), @@ -1275,7 +1291,7 @@ def copy( # type: ignore[override] Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) >>> mi MultiIndex([('a', 'b', 'c')], ) @@ -1809,14 +1825,14 @@ def get_level_values(self, level) -> Index: # type: ignore[override] -------- Create a MultiIndex: - >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def'))) - >>> mi.names = ['level_1', 'level_2'] + >>> mi = pd.MultiIndex.from_arrays((list("abc"), list("def"))) + >>> mi.names = ["level_1", "level_2"] Get level values by supplying level as either integer or name: >>> mi.get_level_values(0) Index(['a', 'b', 'c'], dtype='object', name='level_1') - >>> mi.get_level_values('level_2') + >>> mi.get_level_values("level_2") Index(['d', 'e', 'f'], dtype='object', name='level_2') If a level contains missing values, the return type of the level @@ -1826,7 +1842,9 @@ level_0 int64 level_1 int64 dtype: object - >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).get_level_values(0) + >>> pd.MultiIndex.from_arrays( + ... [[1, None, 2], [3, 4, 5]] + ... ).get_level_values(0) Index([1.0, nan, 2.0], dtype='float64') """ level = self._get_level_number(level) @@ -1877,7 +1895,7 @@ def to_frame( Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) + >>> mi = pd.MultiIndex.from_arrays([["a", "b"], ["c", "d"]]) >>> mi MultiIndex([('a', 'c'), ('b', 'd')], ) >>> df = mi.to_frame() >>> df 0 1 a c a c b d b d >>> mi.to_frame(index=False) 0 1 0 a c 1 b d - >>> df = mi.to_frame(name=['x', 'y']) + >>> df = mi.to_frame(name=["x", "y"]) >>> df x y a c a c @@ -1954,8 +1972,8 @@ def to_flat_index(self) -> Index: # type: ignore[override] Examples -------- >>> index = pd.MultiIndex.from_product( - ... [['foo', 'bar'], ['baz', 'qux']], - ... names=['a', 'b']) + ... [["foo", "bar"], ["baz", "qux"]], names=["a", "b"] + ... ) >>> index.to_flat_index() Index([('foo', 'baz'), ('foo', 'qux'), ('bar', 'baz'), ('bar', 'qux')], dtype='object') @@ -1976,25 +1994,33 @@ def _is_lexsorted(self) -> bool: In the below examples, the first level of the MultiIndex is sorted because a<b<c, so there is no need to look at the next level. - >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], - ... 
['d', 'e', 'f']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [["a", "b", "c"], ["d", "e", "f"]] + ... )._is_lexsorted() True - >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], - ... ['d', 'f', 'e']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [["a", "b", "c"], ["d", "f", "e"]] + ... )._is_lexsorted() True In case there is a tie, the lexicographical sorting looks at the next level of the MultiIndex. - >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'b', 'c']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [[0, 1, 1], ["a", "b", "c"]] + ... )._is_lexsorted() True - >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'c', 'b']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [[0, 1, 1], ["a", "c", "b"]] + ... )._is_lexsorted() False - >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], - ... ['aa', 'bb', 'aa', 'bb']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [["a", "a", "b", "b"], ["aa", "bb", "aa", "bb"]] + ... )._is_lexsorted() True - >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], - ... ['bb', 'aa', 'aa', 'bb']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [["a", "a", "b", "b"], ["bb", "aa", "aa", "bb"]] + ... )._is_lexsorted() False """ return self._lexsort_depth == self.nlevels @@ -2031,8 +2057,10 @@ def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIn Examples -------- - >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi = pd.MultiIndex( + ... levels=[["a", "b"], ["bb", "aa"]], + ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + ... ) >>> mi MultiIndex([('a', 'bb'), ('a', 'aa'), @@ -2095,7 +2123,7 @@ def remove_unused_levels(self) -> MultiIndex: Examples -------- - >>> mi = pd.MultiIndex.from_product([range(2), list('ab')]) + >>> mi = pd.MultiIndex.from_product([range(2), list("ab")]) >>> mi MultiIndex([(0, 'a'), (0, 'b'), @@ -2279,7 +2307,7 @@ def append(self, other): Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b']]) + >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"]]) >>> mi MultiIndex([('a', 'b')], ) @@ -2374,8 +2402,10 @@ def drop( # type: ignore[override] Examples -------- - >>> idx = pd.MultiIndex.from_product([(0, 1, 2), ('green', 'purple')], - ... names=["number", "color"]) + >>> idx = pd.MultiIndex.from_product( + ... [(0, 1, 2), ("green", "purple")], + ... names=["number", "color"], + ... ) >>> idx MultiIndex([(0, 'green'), (0, 'purple'), @@ -2384,7 +2414,7 @@ def drop( # type: ignore[override] (2, 'green'), (2, 'purple')], names=['number', 'color']) - >>> idx.drop([(1, 'green'), (2, 'purple')]) + >>> idx.drop([(1, "green"), (2, "purple")]) MultiIndex([(0, 'green'), (0, 'purple'), (1, 'purple'), @@ -2393,7 +2423,7 @@ def drop( # type: ignore[override] We can also drop from a specific level. - >>> idx.drop('green', level='color') + >>> idx.drop("green", level="color") MultiIndex([(0, 'purple'), (1, 'purple'), (2, 'purple')], @@ -2492,8 +2522,10 @@ def swaplevel(self, i=-2, j=-1) -> MultiIndex: Examples -------- - >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi = pd.MultiIndex( + ... levels=[["a", "b"], ["bb", "aa"]], + ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + ... ) >>> mi MultiIndex([('a', 'bb'), ('a', 'aa'), @@ -2538,7 +2570,9 @@ def reorder_levels(self, order) -> MultiIndex: Examples -------- - >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y']) + >>> mi = pd.MultiIndex.from_arrays( + ... [[1, 2], [3, 4]], names=["x", "y"] + ... 
) >>> mi MultiIndex([(1, 3), (2, 4)], @@ -2549,7 +2583,7 @@ def reorder_levels(self, order) -> MultiIndex: (4, 2)], names=['y', 'x']) - >>> mi.reorder_levels(order=['y', 'x']) + >>> mi.reorder_levels(order=["y", "x"]) MultiIndex([(3, 1), (4, 2)], names=['y', 'x']) @@ -2673,7 +2707,8 @@ def sortlevel( # error: Item "Hashable" of "Union[Hashable, Sequence[Hashable]]" has # no attribute "__iter__" (not iterable) level = [ - self._get_level_number(lev) for lev in level # type: ignore[union-attr] + self._get_level_number(lev) + for lev in level # type: ignore[union-attr] ] sortorder = None @@ -2823,18 +2858,18 @@ def get_slice_bound( Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) + >>> mi = pd.MultiIndex.from_arrays([list("abbc"), list("gefd")]) Get the locations from the leftmost 'b' in the first level until the end of the multiindex: - >>> mi.get_slice_bound('b', side="left") + >>> mi.get_slice_bound("b", side="left") 1 Like above, but if you get the locations from the rightmost 'b' in the first level and 'f' in the second level: - >>> mi.get_slice_bound(('b','f'), side="right") + >>> mi.get_slice_bound(("b", "f"), side="right") 3 See Also @@ -2878,19 +2913,20 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')], - ... names=['A', 'B']) + >>> mi = pd.MultiIndex.from_arrays( + ... [list("abbd"), list("deff")], names=["A", "B"] + ... ) Get the slice locations from the beginning of 'b' in the first level until the end of the multiindex: - >>> mi.slice_locs(start='b') + >>> mi.slice_locs(start="b") (1, 4) Like above, but stop at the end of 'b' in the first level and 'f' in the second level: - >>> mi.slice_locs(start='b', end=('b', 'f')) + >>> mi.slice_locs(start="b", end=("b", "f")) (1, 3) See Also @@ -3012,12 +3048,12 @@ def get_loc(self, key): Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + >>> mi = pd.MultiIndex.from_arrays([list("abb"), list("def")]) - >>> mi.get_loc('b') + >>> mi.get_loc("b") slice(1, 3, None) - >>> mi.get_loc(('b', 'e')) + >>> mi.get_loc(("b", "e")) 1 """ self._check_indexing_error(key) @@ -3130,16 +3166,17 @@ def get_loc_level(self, key, level: IndexLabel = 0, drop_level: bool = True): Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')], - ... names=['A', 'B']) + >>> mi = pd.MultiIndex.from_arrays( + ... [list("abb"), list("def")], names=["A", "B"] + ... ) - >>> mi.get_loc_level('b') + >>> mi.get_loc_level("b") (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B')) - >>> mi.get_loc_level('e', level='B') + >>> mi.get_loc_level("e", level="B") (array([False, True, False]), Index(['b'], dtype='object', name='A')) - >>> mi.get_loc_level(['b', 'e']) + >>> mi.get_loc_level(["b", "e"]) (1, None) """ if not isinstance(level, (list, tuple)): @@ -3441,15 +3478,17 @@ def get_locs(self, seq) -> npt.NDArray[np.intp]: Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + >>> mi = pd.MultiIndex.from_arrays([list("abb"), list("def")]) - >>> mi.get_locs('b') # doctest: +SKIP + >>> mi.get_locs("b") # doctest: +SKIP array([1, 2], dtype=int64) - >>> mi.get_locs([slice(None), ['e', 'f']]) # doctest: +SKIP + >>> mi.get_locs([slice(None), ["e", "f"]]) # doctest: +SKIP array([1, 2], dtype=int64) - >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP + >>> mi.get_locs( + ... [[True, False, True], slice("e", "f")] + ... 
) # doctest: +SKIP array([2], dtype=int64) """ @@ -3657,11 +3696,13 @@ def truncate(self, before=None, after=None) -> MultiIndex: Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z']]) + >>> mi = pd.MultiIndex.from_arrays( + ... [["a", "b", "c"], ["x", "y", "z"]] + ... ) >>> mi MultiIndex([('a', 'x'), ('b', 'y'), ('c', 'z')], ) - >>> mi.truncate(before='a', after='b') + >>> mi.truncate(before="a", after="b") MultiIndex([('a', 'x'), ('b', 'y')], ) """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b2f1933800fd3..bfbc350095770 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -171,7 +171,9 @@ class PeriodIndex(DatetimeIndexOpsMixin): Examples -------- - >>> idx = pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3]) + >>> idx = pd.PeriodIndex.from_fields( + ... year=[2000, 2002], quarter=[1, 3] + ... ) >>> idx PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]') """ @@ -585,7 +587,9 @@ def period_range( Examples -------- - >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + >>> pd.period_range( + ... start="2017-01-01", end="2018-01-01", freq="M" + ... ) PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', '2018-01'], @@ -595,8 +599,11 @@ def period_range( endpoints for a ``PeriodIndex`` with frequency matching that of the ``period_range`` constructor. - >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'), - ... end=pd.Period('2017Q2', freq='Q'), freq='M') + >>> pd.period_range( + ... start=pd.Period("2017Q1", freq="Q"), + ... end=pd.Period("2017Q2", freq="Q"), + ... freq="M", + ... ) PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], dtype='period[M]') """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 08a265ba47648..af6ae0b27c93f 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -113,13 +113,17 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): Examples -------- - >>> pd.TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days']) + >>> pd.TimedeltaIndex( + ... ["0 days", "1 days", "2 days", "3 days", "4 days"] + ... ) TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) We can also let pandas infer the frequency when possible. - >>> pd.TimedeltaIndex(np.arange(5) * 24 * 3600 * 1e9, freq='infer') + >>> pd.TimedeltaIndex( + ... np.arange(5) * 24 * 3600 * 1e9, freq="infer" + ... ) TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D') """ @@ -312,14 +316,14 @@ def timedelta_range( Examples -------- - >>> pd.timedelta_range(start='1 day', periods=4) + >>> pd.timedelta_range(start="1 day", periods=4) TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D') The ``closed`` parameter specifies which endpoint is included. The default behavior is to include both endpoints. - >>> pd.timedelta_range(start='1 day', periods=4, closed='right') + >>> pd.timedelta_range(start="1 day", periods=4, closed="right") TimedeltaIndex(['2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D') @@ -327,7 +331,7 @@ def timedelta_range( Only fixed frequencies can be passed, non-fixed frequencies such as 'M' (month end) will raise. 
- >>> pd.timedelta_range(start='1 day', end='2 days', freq='6h') + >>> pd.timedelta_range(start="1 day", end="2 days", freq="6h") TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00', '1 days 18:00:00', '2 days 00:00:00'], dtype='timedelta64[ns]', freq='6h') @@ -335,14 +339,16 @@ def timedelta_range( Specify ``start``, ``end``, and ``periods``; the frequency is generated automatically (linearly spaced). - >>> pd.timedelta_range(start='1 day', end='5 days', periods=4) + >>> pd.timedelta_range(start="1 day", end="5 days", periods=4) TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00', '5 days 00:00:00'], dtype='timedelta64[ns]', freq=None) **Specify a unit** - >>> pd.timedelta_range("1 Day", periods=3, freq="100000D", unit="s") + >>> pd.timedelta_range( + ... "1 Day", periods=3, freq="100000D", unit="s" + ... ) TimedeltaIndex(['1 days', '100001 days', '200001 days'], dtype='timedelta64[s]', freq='100000D') """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c233295b25700..c72372a7d4927 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -120,14 +120,19 @@ class _IndexSlice: Examples -------- - >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) - >>> columns = ['foo', 'bar'] - >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), - ... index=midx, columns=columns) + >>> midx = pd.MultiIndex.from_product( + ... [["A0", "A1"], ["B0", "B1", "B2", "B3"]] + ... ) + >>> columns = ["foo", "bar"] + >>> dfmi = pd.DataFrame( + ... np.arange(16).reshape((len(midx), len(columns))), + ... index=midx, + ... columns=columns, + ... ) Using the default slice command: - >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :] + >>> dfmi.loc[(slice(None), slice("B0", "B1")), :] foo bar A0 B0 0 1 B1 2 3 @@ -137,7 +142,7 @@ class _IndexSlice: Using the IndexSlice class for a more intuitive command: >>> idx = pd.IndexSlice - >>> dfmi.loc[idx[:, 'B0':'B1'], :] + >>> dfmi.loc[idx[:, "B0":"B1"], :] foo bar A0 B0 0 1 B1 2 3 @@ -199,9 +204,11 @@ def iloc(self) -> _iLocIndexer: Examples -------- - >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, - ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, - ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] + >>> mydict = [ + ... {"a": 1, "b": 2, "c": 3, "d": 4}, + ... {"a": 100, "b": 200, "c": 300, "d": 400}, + ... {"a": 1000, "b": 2000, "c": 3000, "d": 4000}, + ... ] >>> df = pd.DataFrame(mydict) >>> df a b c d @@ -349,9 +356,11 @@ def loc(self) -> _LocIndexer: -------- **Getting values** - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=['cobra', 'viper', 'sidewinder'], - ... columns=['max_speed', 'shield']) + >>> df = pd.DataFrame( + ... [[1, 2], [4, 5], [7, 8]], + ... index=["cobra", "viper", "sidewinder"], + ... columns=["max_speed", "shield"], + ... ) >>> df max_speed shield cobra 1 2 @@ -360,27 +369,27 @@ def loc(self) -> _LocIndexer: Single label. Note this returns the row as a Series. - >>> df.loc['viper'] + >>> df.loc["viper"] max_speed 4 shield 5 Name: viper, dtype: int64 List of labels. Note using ``[[]]`` returns a DataFrame. - >>> df.loc[['viper', 'sidewinder']] + >>> df.loc[["viper", "sidewinder"]] max_speed shield viper 4 5 sidewinder 7 8 Single label for row and column - >>> df.loc['cobra', 'shield'] + >>> df.loc["cobra", "shield"] 2 Slice with labels for row and single label for column. As mentioned above, note that both the start and stop of the slice are included. 
- >>> df.loc['cobra':'viper', 'max_speed'] + >>> df.loc["cobra":"viper", "max_speed"] cobra 1 viper 4 Name: max_speed, dtype: int64 @@ -393,8 +402,12 @@ def loc(self) -> _LocIndexer: Alignable boolean Series: - >>> df.loc[pd.Series([False, True, False], - ... index=['viper', 'sidewinder', 'cobra'])] + >>> df.loc[ + ... pd.Series( + ... [False, True, False], + ... index=["viper", "sidewinder", "cobra"], + ... ) + ... ] max_speed shield sidewinder 7 8 @@ -408,25 +421,25 @@ def loc(self) -> _LocIndexer: Conditional that returns a boolean Series - >>> df.loc[df['shield'] > 6] + >>> df.loc[df["shield"] > 6] max_speed shield sidewinder 7 8 Conditional that returns a boolean Series with column labels specified - >>> df.loc[df['shield'] > 6, ['max_speed']] + >>> df.loc[df["shield"] > 6, ["max_speed"]] max_speed sidewinder 7 Multiple conditional using ``&`` that returns a boolean Series - >>> df.loc[(df['max_speed'] > 1) & (df['shield'] < 8)] + >>> df.loc[(df["max_speed"] > 1) & (df["shield"] < 8)] max_speed shield viper 4 5 Multiple conditional using ``|`` that returns a boolean Series - >>> df.loc[(df['max_speed'] > 4) | (df['shield'] < 5)] + >>> df.loc[(df["max_speed"] > 4) | (df["shield"] < 5)] max_speed shield cobra 1 2 sidewinder 7 8 @@ -443,7 +456,7 @@ def loc(self) -> _LocIndexer: Callable that returns a boolean Series - >>> df.loc[lambda df: df['shield'] == 8] + >>> df.loc[lambda df: df["shield"] == 8] max_speed shield sidewinder 7 8 @@ -451,7 +464,7 @@ def loc(self) -> _LocIndexer: Set value for all items matching the list of labels - >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 + >>> df.loc[["viper", "sidewinder"], ["shield"]] = 50 >>> df max_speed shield cobra 1 2 @@ -460,7 +473,7 @@ def loc(self) -> _LocIndexer: Set value for an entire row - >>> df.loc['cobra'] = 10 + >>> df.loc["cobra"] = 10 >>> df max_speed shield cobra 10 10 @@ -469,7 +482,7 @@ def loc(self) -> _LocIndexer: Set value for an entire column - >>> df.loc[:, 'max_speed'] = 30 + >>> df.loc[:, "max_speed"] = 30 >>> df max_speed shield cobra 30 10 @@ -478,7 +491,7 @@ def loc(self) -> _LocIndexer: Set value for rows matching callable condition - >>> df.loc[df['shield'] > 35] = 0 + >>> df.loc[df["shield"] > 35] = 0 >>> df max_speed shield cobra 30 10 @@ -509,8 +522,11 @@ def loc(self) -> _LocIndexer: Another example using integers for the index - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=[7, 8, 9], columns=['max_speed', 'shield']) + >>> df = pd.DataFrame( + ... [[1, 2], [4, 5], [7, 8]], + ... index=[7, 8, 9], + ... columns=["max_speed", "shield"], + ... ) >>> df max_speed shield 7 1 2 @@ -531,14 +547,25 @@ def loc(self) -> _LocIndexer: A number of examples using a DataFrame with a MultiIndex >>> tuples = [ - ... ('cobra', 'mark i'), ('cobra', 'mark ii'), - ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), - ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ("cobra", "mark i"), + ... ("cobra", "mark ii"), + ... ("sidewinder", "mark i"), + ... ("sidewinder", "mark ii"), + ... ("viper", "mark ii"), + ... ("viper", "mark iii"), ... ] >>> index = pd.MultiIndex.from_tuples(tuples) - >>> values = [[12, 2], [0, 4], [10, 20], - ... [1, 4], [7, 1], [16, 36]] - >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) + >>> values = [ + ... [12, 2], + ... [0, 4], + ... [10, 20], + ... [1, 4], + ... [7, 1], + ... [16, 36], + ... ] + >>> df = pd.DataFrame( + ... values, columns=["max_speed", "shield"], index=index + ... 
) >>> df max_speed shield cobra mark i 12 2 @@ -550,14 +577,14 @@ def loc(self) -> _LocIndexer: Single label. Note this returns a DataFrame with a single index. - >>> df.loc['cobra'] + >>> df.loc["cobra"] max_speed shield mark i 12 2 mark ii 0 4 Single index tuple. Note this returns a Series. - >>> df.loc[('cobra', 'mark ii')] + >>> df.loc[("cobra", "mark ii")] max_speed 0 shield 4 Name: (cobra, mark ii), dtype: int64 @@ -565,25 +592,25 @@ def loc(self) -> _LocIndexer: Single label for row and column. Similar to passing in a tuple, this returns a Series. - >>> df.loc['cobra', 'mark i'] + >>> df.loc["cobra", "mark i"] max_speed 12 shield 2 Name: (cobra, mark i), dtype: int64 Single tuple. Note using ``[[]]`` returns a DataFrame. - >>> df.loc[[('cobra', 'mark ii')]] + >>> df.loc[[("cobra", "mark ii")]] max_speed shield cobra mark ii 0 4 Single tuple for the index with a single label for the column - >>> df.loc[('cobra', 'mark i'), 'shield'] + >>> df.loc[("cobra", "mark i"), "shield"] 2 Slice from index tuple to single label - >>> df.loc[('cobra', 'mark i'):'viper'] + >>> df.loc[("cobra", "mark i") : "viper"] max_speed shield cobra mark i 12 2 mark ii 0 4 @@ -594,7 +621,7 @@ def loc(self) -> _LocIndexer: Slice from index tuple to index tuple - >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] + >>> df.loc[("cobra", "mark i") : ("viper", "mark ii")] max_speed shield cobra mark i 12 2 mark ii 0 4 @@ -646,8 +673,11 @@ def at(self) -> _AtIndexer: Examples -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df = pd.DataFrame( + ... [[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], + ... columns=["A", "B", "C"], + ... ) >>> df A B C 4 0 2 3 @@ -656,18 +686,18 @@ def at(self) -> _AtIndexer: Get value at specified row/column pair - >>> df.at[4, 'B'] + >>> df.at[4, "B"] 2 Set value at specified row/column pair - >>> df.at[4, 'B'] = 10 - >>> df.at[4, 'B'] + >>> df.at[4, "B"] = 10 + >>> df.at[4, "B"] 10 Get value within a Series - >>> df.loc[5].at['B'] + >>> df.loc[5].at["B"] 4 """ return _AtIndexer("at", self) @@ -694,8 +724,10 @@ def iat(self) -> _iAtIndexer: Examples -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... columns=['A', 'B', 'C']) + >>> df = pd.DataFrame( + ... [[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=["A", "B", "C"], + ... ) >>> df A B C 0 0 2 3 diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index d45ae37890ba7..e62a0997b6eee 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -48,12 +48,17 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: Examples -------- - >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - >>> interchange_object = df_not_necessarily_pandas.__dataframe__() + >>> df_not_necessarily_pandas = pd.DataFrame( + ... {"A": [1, 2], "B": [3, 4]} + ... ) + >>> interchange_object = ( + ... df_not_necessarily_pandas.__dataframe__() + ... ) >>> interchange_object.column_names() Index(['A', 'B'], dtype='object') - >>> df_pandas = (pd.api.interchange.from_dataframe - ... (interchange_object.select_columns_by_name(['A']))) + >>> df_pandas = pd.api.interchange.from_dataframe( + ... interchange_object.select_columns_by_name(["A"]) + ... 
) >>> df_pandas A 0 1 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1af2d9e739038..e3f90d2281deb 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1842,7 +1842,8 @@ def shift(self, periods: int, fill_value: Any = None) -> list[Block]: # error: Argument 1 to "np_can_hold_element" has incompatible type # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" casted = np_can_hold_element( - self.dtype, fill_value # type: ignore[arg-type] + self.dtype, + fill_value, # type: ignore[arg-type] ) except LossySetitemError: nb = self.coerce_to_target_dtype(fill_value) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index b2d463a8c6c26..3d5bd9eb94f50 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -118,7 +118,9 @@ def concatenate_managers( # type "List[BlockManager]"; expected "List[Union[ArrayManager, # SingleArrayManager, BlockManager, SingleBlockManager]]" return _concatenate_array_managers( - mgrs, axes, concat_axis # type: ignore[arg-type] + mgrs, + axes, + concat_axis, # type: ignore[arg-type] ) # Assertions disabled for performance @@ -474,9 +476,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike # error: No overload variant of "__getitem__" of "ExtensionArray" matches # argument type "Tuple[int, slice]" to_concat = [ - t - if is_1d_only_ea_dtype(t.dtype) - else t[0, :] # type: ignore[call-overload] + t if is_1d_only_ea_dtype(t.dtype) else t[0, :] # type: ignore[call-overload] for t in to_concat ] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d275445983b6f..104eaff495299 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1028,7 +1028,9 @@ def _interp_limit( def _interp_limit(invalid, fw_limit, bw_limit): for x in np.where(invalid)[0]: - if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): + if invalid[ + max(0, x - fw_limit) : x + bw_limit + 1 + ].all(): yield x """ # handle forward first; the backward direction is the same except diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index fc685935a35fc..c6b6a1acdbd81 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -89,9 +89,9 @@ def mask_zero_div_zero(x, y, result: np.ndarray) -> np.ndarray: >>> x = np.array([1, 0, -1], dtype=np.int64) >>> x array([ 1, 0, -1]) - >>> y = 0 # int 0; numpy behavior is different with float + >>> y = 0 # int 0; numpy behavior is different with float >>> result = x // y - >>> result # raw numpy result does not fill division by zero + >>> result # raw numpy result does not fill division by zero array([0, 0, 0]) >>> mask_zero_div_zero(x, y, result) array([ inf, nan, -inf]) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8ed47f16c7abe..6160dc406dcdf 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -377,16 +377,16 @@ def transform(self, arg, *args, **kwargs): Examples -------- - >>> s = pd.Series([1, 2], - ... index=pd.date_range('20180101', - ... periods=2, - ... freq='1h')) + >>> s = pd.Series( + ... [1, 2], + ... index=pd.date_range("20180101", periods=2, freq="1h"), + ... 
) >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 Freq: h, dtype: int64 - >>> resampled = s.resample('15min') + >>> resampled = s.resample("15min") >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) 2018-01-01 00:00:00 NaN 2018-01-01 01:00:00 NaN @@ -541,8 +541,17 @@ def ffill(self, limit: int | None = None): -------- Here we only create a ``Series``. - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 @@ -552,7 +561,7 @@ def ffill(self, limit: int | None = None): Example for ``ffill`` with downsampling (we have fewer dates after resampling): - >>> ser.resample('MS').ffill() + >>> ser.resample("MS").ffill() 2023-01-01 1 2023-02-01 3 Freq: MS, dtype: int64 @@ -560,7 +569,7 @@ def ffill(self, limit: int | None = None): Example for ``ffill`` with upsampling (fill the new dates with the previous value): - >>> ser.resample('W').ffill() + >>> ser.resample("W").ffill() 2023-01-01 1 2023-01-08 1 2023-01-15 2 @@ -574,7 +583,7 @@ def ffill(self, limit: int | None = None): With upsampling and limiting (only fill the first new date with the previous value): - >>> ser.resample('W').ffill(limit=1) + >>> ser.resample("W").ffill(limit=1) 2023-01-01 1.0 2023-01-08 1.0 2023-01-15 2.0 @@ -619,16 +628,16 @@ def nearest(self, limit: int | None = None): Examples -------- - >>> s = pd.Series([1, 2], - ... index=pd.date_range('20180101', - ... periods=2, - ... freq='1h')) + >>> s = pd.Series( + ... [1, 2], + ... index=pd.date_range("20180101", periods=2, freq="1h"), + ... ) >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 Freq: h, dtype: int64 - >>> s.resample('15min').nearest() + >>> s.resample("15min").nearest() 2018-01-01 00:00:00 1 2018-01-01 00:15:00 1 2018-01-01 00:30:00 2 @@ -638,7 +647,7 @@ def nearest(self, limit: int | None = None): Limit the number of upsampled values imputed by the nearest: - >>> s.resample('15min').nearest(limit=1) + >>> s.resample("15min").nearest(limit=1) 2018-01-01 00:00:00 1.0 2018-01-01 00:15:00 1.0 2018-01-01 00:30:00 NaN @@ -690,15 +699,17 @@ def bfill(self, limit: int | None = None): -------- Resampling a Series: - >>> s = pd.Series([1, 2, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> s = pd.Series( + ... [1, 2, 3], + ... index=pd.date_range("20180101", periods=3, freq="h"), + ... ) >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 2018-01-01 02:00:00 3 Freq: h, dtype: int64 - >>> s.resample('30min').bfill() + >>> s.resample("30min").bfill() 2018-01-01 00:00:00 1 2018-01-01 00:30:00 2 2018-01-01 01:00:00 2 @@ -706,7 +717,7 @@ def bfill(self, limit: int | None = None): 2018-01-01 02:00:00 3 Freq: 30min, dtype: int64 - >>> s.resample('15min').bfill(limit=2) + >>> s.resample("15min").bfill(limit=2) 2018-01-01 00:00:00 1.0 2018-01-01 00:15:00 NaN 2018-01-01 00:30:00 2.0 @@ -720,16 +731,17 @@ def bfill(self, limit: int | None = None): Resampling a DataFrame that has missing values: - >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, - ... index=pd.date_range('20180101', periods=3, - ... freq='h')) + >>> df = pd.DataFrame( + ... {"a": [2, np.nan, 6], "b": [1, 3, 5]}, + ... index=pd.date_range("20180101", periods=3, freq="h"), + ... 
) >>> df a b 2018-01-01 00:00:00 2.0 1 2018-01-01 01:00:00 NaN 3 2018-01-01 02:00:00 6.0 5 - >>> df.resample('30min').bfill() + >>> df.resample("30min").bfill() a b 2018-01-01 00:00:00 2.0 1 2018-01-01 00:30:00 NaN 3 @@ -737,7 +749,7 @@ def bfill(self, limit: int | None = None): 2018-01-01 01:30:00 6.0 5 2018-01-01 02:00:00 6.0 5 - >>> df.resample('15min').bfill(limit=2) + >>> df.resample("15min").bfill(limit=2) a b 2018-01-01 00:00:00 2.0 1.0 2018-01-01 00:15:00 NaN NaN @@ -802,8 +814,10 @@ def fillna(self, method, limit: int | None = None): -------- Resampling a Series: - >>> s = pd.Series([1, 2, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> s = pd.Series( + ... [1, 2, 3], + ... index=pd.date_range("20180101", periods=3, freq="h"), + ... ) >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 @@ -820,7 +834,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 02:00:00 3.0 Freq: 30min, dtype: float64 - >>> s.resample('30min').fillna("backfill") + >>> s.resample("30min").fillna("backfill") 2018-01-01 00:00:00 1 2018-01-01 00:30:00 2 2018-01-01 01:00:00 2 @@ -828,7 +842,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 02:00:00 3 Freq: 30min, dtype: int64 - >>> s.resample('15min').fillna("backfill", limit=2) + >>> s.resample("15min").fillna("backfill", limit=2) 2018-01-01 00:00:00 1.0 2018-01-01 00:15:00 NaN 2018-01-01 00:30:00 2.0 @@ -840,7 +854,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 02:00:00 3.0 Freq: 15min, dtype: float64 - >>> s.resample('30min').fillna("pad") + >>> s.resample("30min").fillna("pad") 2018-01-01 00:00:00 1 2018-01-01 00:30:00 1 2018-01-01 01:00:00 2 @@ -848,7 +862,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 02:00:00 3 Freq: 30min, dtype: int64 - >>> s.resample('30min').fillna("nearest") + >>> s.resample("30min").fillna("nearest") 2018-01-01 00:00:00 1 2018-01-01 00:30:00 2 2018-01-01 01:00:00 2 @@ -858,15 +872,17 @@ def fillna(self, method, limit: int | None = None): Missing values present before the upsampling are not affected. - >>> sm = pd.Series([1, None, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> sm = pd.Series( + ... [1, None, 3], + ... index=pd.date_range("20180101", periods=3, freq="h"), + ... ) >>> sm 2018-01-01 00:00:00 1.0 2018-01-01 01:00:00 NaN 2018-01-01 02:00:00 3.0 Freq: h, dtype: float64 - >>> sm.resample('30min').fillna('backfill') + >>> sm.resample("30min").fillna("backfill") 2018-01-01 00:00:00 1.0 2018-01-01 00:30:00 NaN 2018-01-01 01:00:00 NaN @@ -874,7 +890,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 02:00:00 3.0 Freq: 30min, dtype: float64 - >>> sm.resample('30min').fillna('pad') + >>> sm.resample("30min").fillna("pad") 2018-01-01 00:00:00 1.0 2018-01-01 00:30:00 1.0 2018-01-01 01:00:00 NaN @@ -882,7 +898,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 02:00:00 3.0 Freq: 30min, dtype: float64 - >>> sm.resample('30min').fillna('nearest') + >>> sm.resample("30min").fillna("nearest") 2018-01-01 00:00:00 1.0 2018-01-01 00:30:00 NaN 2018-01-01 01:00:00 NaN @@ -893,16 +909,17 @@ def fillna(self, method, limit: int | None = None): DataFrame resampling is done column-wise. All the same options are available. - >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, - ... index=pd.date_range('20180101', periods=3, - ... freq='h')) + >>> df = pd.DataFrame( + ... {"a": [2, np.nan, 6], "b": [1, 3, 5]}, + ... index=pd.date_range("20180101", periods=3, freq="h"), + ... 
) >>> df a b 2018-01-01 00:00:00 2.0 1 2018-01-01 01:00:00 NaN 3 2018-01-01 02:00:00 6.0 5 - >>> df.resample('30min').fillna("bfill") + >>> df.resample("30min").fillna("bfill") a b 2018-01-01 00:00:00 2.0 1 2018-01-01 00:30:00 NaN 3 @@ -1120,15 +1137,24 @@ def asfreq(self, fill_value=None): Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-31', '2023-02-01', '2023-02-28'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-31", + ... "2023-02-01", + ... "2023-02-28", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-31 2 2023-02-01 3 2023-02-28 4 dtype: int64 - >>> ser.resample('MS').asfreq() + >>> ser.resample("MS").asfreq() 2023-01-01 1 2023-02-01 3 Freq: MS, dtype: int64 @@ -1166,15 +1192,24 @@ def sum( Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').sum() + >>> ser.resample("MS").sum() 2023-01-01 3 2023-02-01 7 Freq: MS, dtype: int64 @@ -1214,15 +1249,24 @@ def prod( Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').prod() + >>> ser.resample("MS").prod() 2023-01-01 2 2023-02-01 12 Freq: MS, dtype: int64 @@ -1248,15 +1292,24 @@ def min( Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').min() + >>> ser.resample("MS").min() 2023-01-01 1 2023-02-01 3 Freq: MS, dtype: int64 @@ -1283,15 +1336,24 @@ def max( Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').max() + >>> ser.resample("MS").max() 2023-01-01 2 2023-02-01 4 Freq: MS, dtype: int64 @@ -1360,15 +1422,24 @@ def mean( Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-15", + ... ] + ... ), + ... 
) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').mean() + >>> ser.resample("MS").mean() 2023-01-01 1.5 2023-02-01 3.5 Freq: MS, dtype: float64 @@ -1409,14 +1480,20 @@ def std( Examples -------- - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').std() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").std() 2023-01-01 1.000000 2023-02-01 2.645751 Freq: MS, dtype: float64 @@ -1458,19 +1535,25 @@ def var( Examples -------- - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').var() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").var() 2023-01-01 1.0 2023-02-01 7.0 Freq: MS, dtype: float64 - >>> ser.resample('MS').var(ddof=0) + >>> ser.resample("MS").var(ddof=0) 2023-01-01 0.666667 2023-02-01 4.666667 Freq: MS, dtype: float64 @@ -1597,19 +1680,25 @@ def quantile(self, q: float | list[float] | AnyArrayLike = 0.5, **kwargs): Examples -------- - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').quantile() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").quantile() 2023-01-01 2.0 2023-02-01 4.0 Freq: MS, dtype: float64 - >>> ser.resample('MS').quantile(.25) + >>> ser.resample("MS").quantile(0.25) 2023-01-01 1.5 2023-02-01 3.5 Freq: MS, dtype: float64 diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d46348fff7a02..9d654a1a817d3 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -238,8 +238,8 @@ def concat( -------- Combine two ``Series``. - >>> s1 = pd.Series(['a', 'b']) - >>> s2 = pd.Series(['c', 'd']) + >>> s1 = pd.Series(["a", "b"]) + >>> s2 = pd.Series(["c", "d"]) >>> pd.concat([s1, s2]) 0 a 1 b @@ -260,7 +260,7 @@ def concat( Add a hierarchical index at the outermost level of the data with the ``keys`` option. - >>> pd.concat([s1, s2], keys=['s1', 's2']) + >>> pd.concat([s1, s2], keys=["s1", "s2"]) s1 0 a 1 b s2 0 c @@ -269,8 +269,11 @@ def concat( Label the index keys you create with the ``names`` option. - >>> pd.concat([s1, s2], keys=['s1', 's2'], - ... names=['Series name', 'Row ID']) + >>> pd.concat( + ... [s1, s2], + ... keys=["s1", "s2"], + ... names=["Series name", "Row ID"], + ... ) Series name Row ID s1 0 a 1 b @@ -280,14 +283,16 @@ def concat( Combine two ``DataFrame`` objects with identical columns. - >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], - ... columns=['letter', 'number']) + >>> df1 = pd.DataFrame( + ... [["a", 1], ["b", 2]], columns=["letter", "number"] + ... 
) >>> df1 letter number 0 a 1 1 b 2 - >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], - ... columns=['letter', 'number']) + >>> df2 = pd.DataFrame( + ... [["c", 3], ["d", 4]], columns=["letter", "number"] + ... ) >>> df2 letter number 0 c 3 @@ -303,8 +308,10 @@ def concat( and return everything. Columns outside the intersection will be filled with ``NaN`` values. - >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], - ... columns=['letter', 'number', 'animal']) + >>> df3 = pd.DataFrame( + ... [["c", 3, "cat"], ["d", 4, "dog"]], + ... columns=["letter", "number", "animal"], + ... ) >>> df3 letter number animal 0 c 3 cat @@ -330,8 +337,10 @@ def concat( Combine ``DataFrame`` objects horizontally along the x axis by passing in ``axis=1``. - >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']], - ... columns=['animal', 'name']) + >>> df4 = pd.DataFrame( + ... [["bird", "polly"], ["monkey", "george"]], + ... columns=["animal", "name"], + ... ) >>> pd.concat([df1, df4], axis=1) letter number animal name 0 a 1 bird polly @@ -340,11 +349,11 @@ def concat( Prevent the result from including duplicate index values with the ``verify_integrity`` option. - >>> df5 = pd.DataFrame([1], index=['a']) + >>> df5 = pd.DataFrame([1], index=["a"]) >>> df5 0 a 1 - >>> df6 = pd.DataFrame([2], index=['a']) + >>> df6 = pd.DataFrame([2], index=["a"]) >>> df6 0 a 2 @@ -355,11 +364,11 @@ def concat( Append a single row to the end of a ``DataFrame`` object. - >>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0]) + >>> df7 = pd.DataFrame({"a": 1, "b": 2}, index=[0]) >>> df7 a b 0 1 2 - >>> new_row = pd.Series({'a': 3, 'b': 4}) + >>> new_row = pd.Series({"a": 3, "b": 4}) >>> new_row a 3 b 4 diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 3ed67bb7b7c02..c1c34cc10068b 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -101,7 +101,7 @@ def get_dummies( Examples -------- - >>> s = pd.Series(list('abca')) + >>> s = pd.Series(list("abca")) >>> pd.get_dummies(s) a b c @@ -110,7 +110,7 @@ def get_dummies( 2 False False True 3 True False False - >>> s1 = ['a', 'b', np.nan] + >>> s1 = ["a", "b", np.nan] >>> pd.get_dummies(s1) a b @@ -124,16 +124,21 @@ def get_dummies( 1 False True False 2 False False True - >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], - ... 'C': [1, 2, 3]}) + >>> df = pd.DataFrame( + ... { + ... "A": ["a", "b", "a"], + ... "B": ["b", "a", "c"], + ... "C": [1, 2, 3], + ... } + ... ) - >>> pd.get_dummies(df, prefix=['col1', 'col2']) + >>> pd.get_dummies(df, prefix=["col1", "col2"]) C col1_a col1_b col2_a col2_b col2_c 0 1 True False False True False 1 2 False True True False False 2 3 True False False False True - >>> pd.get_dummies(pd.Series(list('abcaa'))) + >>> pd.get_dummies(pd.Series(list("abcaa"))) a b c 0 True False False 1 False True False @@ -141,7 +146,7 @@ def get_dummies( 3 True False False 4 True False False - >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + >>> pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) b c 0 False False 1 True False @@ -149,7 +154,7 @@ def get_dummies( 3 False False 4 False False - >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) + >>> pd.get_dummies(pd.Series(list("abc")), dtype=float) a b c 0 1.0 0.0 0.0 1 0.0 1.0 0.0 @@ -426,8 +431,13 @@ def from_dummies( Examples -------- - >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], - ... "c": [0, 0, 1, 0]}) + >>> df = pd.DataFrame( + ... { + ... "a": [1, 0, 0, 1], + ... 
"b": [0, 1, 0, 0], + ... "c": [0, 0, 1, 0], + ... } + ... ) >>> df a b c @@ -442,9 +452,15 @@ def from_dummies( 2 c 3 a - >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], - ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], - ... "col2_c": [0, 0, 1]}) + >>> df = pd.DataFrame( + ... { + ... "col1_a": [1, 0, 1], + ... "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], + ... "col2_b": [1, 0, 0], + ... "col2_c": [0, 0, 1], + ... } + ... ) >>> df col1_a col1_b col2_a col2_b col2_c @@ -458,9 +474,15 @@ def from_dummies( 1 b a 2 a c - >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0], - ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], - ... "col2_c": [0, 0, 0]}) + >>> df = pd.DataFrame( + ... { + ... "col1_a": [1, 0, 0], + ... "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], + ... "col2_b": [1, 0, 0], + ... "col2_c": [0, 0, 0], + ... } + ... ) >>> df col1_a col1_b col2_a col2_b col2_c @@ -468,7 +490,9 @@ def from_dummies( 1 0 1 1 0 0 2 0 0 0 0 0 - >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"}) + >>> pd.from_dummies( + ... df, sep="_", default_category={"col1": "d", "col2": "e"} + ... ) col1 col2 0 a b 1 b a diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bb1cd0d738dac..8844d1c795303 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -176,15 +176,23 @@ def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame: Examples -------- - >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], - ... 'team': ['Red Sox', 'Yankees'], - ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) + >>> data = pd.DataFrame( + ... { + ... "hr1": [514, 573], + ... "hr2": [545, 526], + ... "team": ["Red Sox", "Yankees"], + ... "year1": [2007, 2007], + ... "year2": [2008, 2008], + ... } + ... ) >>> data hr1 hr2 team year1 year2 0 514 545 Red Sox 2007 2008 1 573 526 Yankees 2007 2008 - >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) + >>> pd.lreshape( + ... data, {"year": ["year1", "year2"], "hr": ["hr1", "hr2"]} + ... ) team year hr 0 Red Sox 2007 514 1 Yankees 2007 573 @@ -290,12 +298,15 @@ def wide_to_long( Examples -------- >>> np.random.seed(123) - >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, - ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, - ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, - ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, - ... "X" : dict(zip(range(3), np.random.randn(3))) - ... }) + >>> df = pd.DataFrame( + ... { + ... "A1970": {0: "a", 1: "b", 2: "c"}, + ... "A1980": {0: "d", 1: "e", 2: "f"}, + ... "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + ... "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + ... "X": dict(zip(range(3), np.random.randn(3))), + ... } + ... ) >>> df["id"] = df.index >>> df A1970 A1980 B1970 B1980 X id @@ -315,12 +326,24 @@ def wide_to_long( With multiple id columns - >>> df = pd.DataFrame({ - ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - ... }) + >>> df = pd.DataFrame( + ... { + ... "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... "ht2": [ + ... 3.4, + ... 3.8, + ... 2.9, + ... 3.2, + ... 2.8, + ... 2.4, + ... 3.3, + ... 3.4, + ... 2.9, + ... ], + ... } + ... 
) >>> df famid birth ht1 ht2 0 1 1 2.8 3.4 @@ -332,7 +355,9 @@ def wide_to_long( 6 3 1 2.2 3.3 7 3 2 2.3 3.4 8 3 3 2.1 2.9 - >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') + >>> l = pd.wide_to_long( + ... df, stubnames="ht", i=["famid", "birth"], j="age" + ... ) >>> l ... # doctest: +NORMALIZE_WHITESPACE ht @@ -359,7 +384,7 @@ def wide_to_long( Going from long back to wide just takes some creative use of `unstack` >>> w = l.unstack() - >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format) + >>> w.columns = w.columns.map("{0[0]}{0[1]}".format) >>> w.reset_index() famid birth ht1 ht2 0 1 1 2.8 3.4 @@ -375,20 +400,29 @@ def wide_to_long( Less wieldy column names are also handled >>> np.random.seed(0) - >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3), - ... 'A(weekly)-2011': np.random.rand(3), - ... 'B(weekly)-2010': np.random.rand(3), - ... 'B(weekly)-2011': np.random.rand(3), - ... 'X' : np.random.randint(3, size=3)}) - >>> df['id'] = df.index - >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + >>> df = pd.DataFrame( + ... { + ... "A(weekly)-2010": np.random.rand(3), + ... "A(weekly)-2011": np.random.rand(3), + ... "B(weekly)-2010": np.random.rand(3), + ... "B(weekly)-2011": np.random.rand(3), + ... "X": np.random.randint(3, size=3), + ... } + ... ) + >>> df["id"] = df.index + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id 0 0.548814 0.544883 0.437587 0.383442 0 0 1 0.715189 0.423655 0.891773 0.791725 1 1 2 0.602763 0.645894 0.963663 0.528895 1 2 - >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id', - ... j='year', sep='-') + >>> pd.wide_to_long( + ... df, + ... ["A(weekly)", "B(weekly)"], + ... i="id", + ... j="year", + ... sep="-", + ... ) ... # doctest: +NORMALIZE_WHITESPACE X A(weekly) B(weekly) id year @@ -403,8 +437,15 @@ def wide_to_long( stubnames and pass that list on to wide_to_long >>> stubnames = sorted( - ... set([match[0] for match in df.columns.str.findall( - ... r'[A-B]\(.*\)').values if match != []]) + ... set( + ... [ + ... match[0] + ... for match in df.columns.str.findall( + ... r"[A-B]\(.*\)" + ... ).values + ... if match != [] + ... ] + ... ) ... ) >>> list(stubnames) ['A(weekly)', 'B(weekly)'] @@ -412,12 +453,34 @@ def wide_to_long( All of the above examples have integers as suffixes. It is possible to have non-integers as suffixes. - >>> df = pd.DataFrame({ - ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - ... }) + >>> df = pd.DataFrame( + ... { + ... "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... "ht_one": [ + ... 2.8, + ... 2.9, + ... 2.2, + ... 2, + ... 1.8, + ... 1.9, + ... 2.2, + ... 2.3, + ... 2.1, + ... ], + ... "ht_two": [ + ... 3.4, + ... 3.8, + ... 2.9, + ... 3.2, + ... 2.8, + ... 2.4, + ... 3.3, + ... 3.4, + ... 2.9, + ... ], + ... } + ... ) >>> df famid birth ht_one ht_two 0 1 1 2.8 3.4 @@ -430,8 +493,14 @@ def wide_to_long( 7 3 2 2.3 3.4 8 3 3 2.1 2.9 - >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age', - ... sep='_', suffix=r'\w+') + >>> l = pd.wide_to_long( + ... df, + ... stubnames="ht", + ... i=["famid", "birth"], + ... j="age", + ... sep="_", + ... suffix=r"\w+", + ... ) >>> l ... 
# doctest: +NORMALIZE_WHITESPACE ht diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9d3d1b25a941e..8a8e7cfccfe50 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -367,7 +367,7 @@ def merge_ordered( ... { ... "key": ["a", "c", "e", "a", "c", "e"], ... "lvalue": [1, 2, 3, 1, 2, 3], - ... "group": ["a", "a", "a", "b", "b", "b"] + ... "group": ["a", "a", "a", "b", "b", "b"], ... } ... ) >>> df1 @@ -379,14 +379,18 @@ def merge_ordered( 4 c 2 b 5 e 3 b - >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + >>> df2 = pd.DataFrame( + ... {"key": ["b", "c", "d"], "rvalue": [1, 2, 3]} + ... ) >>> df2 key rvalue 0 b 1 1 c 2 2 d 3 - >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group") + >>> merge_ordered( + ... df1, df2, fill_method="ffill", left_by="group" + ... ) key lvalue group rvalue 0 a 1 a NaN 1 b 1 a 1.0 @@ -522,14 +526,18 @@ def merge_asof( Examples -------- - >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + >>> left = pd.DataFrame( + ... {"a": [1, 5, 10], "left_val": ["a", "b", "c"]} + ... ) >>> left a left_val 0 1 a 1 5 b 2 10 c - >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) + >>> right = pd.DataFrame( + ... {"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]} + ... ) >>> right a right_val 0 1 1 @@ -544,7 +552,9 @@ def merge_asof( 1 5 b 3 2 10 c 7 - >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False) + >>> pd.merge_asof( + ... left, right, on="a", allow_exact_matches=False + ... ) a left_val right_val 0 1 a NaN 1 5 b 3.0 @@ -564,14 +574,18 @@ def merge_asof( We can use indexed DataFrames as well. - >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10]) + >>> left = pd.DataFrame( + ... {"left_val": ["a", "b", "c"]}, index=[1, 5, 10] + ... ) >>> left left_val 1 a 5 b 10 c - >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7]) + >>> right = pd.DataFrame( + ... {"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7] + ... ) >>> right right_val 1 1 @@ -580,7 +594,9 @@ def merge_asof( 6 6 7 7 - >>> pd.merge_asof(left, right, left_index=True, right_index=True) + >>> pd.merge_asof( + ... left, right, left_index=True, right_index=True + ... ) left_val right_val 1 a 1 5 b 3 @@ -598,20 +614,38 @@ def merge_asof( ... pd.Timestamp("2016-05-25 13:30:00.048"), ... pd.Timestamp("2016-05-25 13:30:00.049"), ... pd.Timestamp("2016-05-25 13:30:00.072"), - ... pd.Timestamp("2016-05-25 13:30:00.075") + ... pd.Timestamp("2016-05-25 13:30:00.075"), ... ], ... "ticker": [ - ... "GOOG", - ... "MSFT", - ... "MSFT", - ... "MSFT", - ... "GOOG", - ... "AAPL", - ... "GOOG", - ... "MSFT" - ... ], - ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT", + ... ], + ... "bid": [ + ... 720.50, + ... 51.95, + ... 51.97, + ... 51.99, + ... 720.50, + ... 97.99, + ... 720.50, + ... 52.01, + ... ], + ... "ask": [ + ... 720.93, + ... 51.96, + ... 51.98, + ... 52.00, + ... 720.93, + ... 98.01, + ... 720.88, + ... 52.03, + ... ], ... } ... ) >>> quotes @@ -626,19 +660,19 @@ def merge_asof( 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 >>> trades = pd.DataFrame( - ... { - ... "time": [ - ... pd.Timestamp("2016-05-25 13:30:00.023"), - ... pd.Timestamp("2016-05-25 13:30:00.038"), - ... pd.Timestamp("2016-05-25 13:30:00.048"), - ... 
pd.Timestamp("2016-05-25 13:30:00.048"), - ... pd.Timestamp("2016-05-25 13:30:00.048") - ... ], - ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], - ... "quantity": [75, 155, 100, 100, 100] - ... } - ... ) + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.038"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100], + ... } + ... ) >>> trades time ticker price quantity 0 2016-05-25 13:30:00.023 MSFT 51.95 75 @@ -660,7 +694,11 @@ def merge_asof( We only asof within 2ms between the quote time and the trade time >>> pd.merge_asof( - ... trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") + ... trades, + ... quotes, + ... on="time", + ... by="ticker", + ... tolerance=pd.Timedelta("2ms"), ... ) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 @@ -679,7 +717,7 @@ def merge_asof( ... on="time", ... by="ticker", ... tolerance=pd.Timedelta("10ms"), - ... allow_exact_matches=False + ... allow_exact_matches=False, ... ) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 82718d4c43a65..bfa260c0fba00 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -535,7 +535,8 @@ def pivot( # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray") # error: Unsupported left operand type for + ("ExtensionArray") indexed = data.set_index( - cols + columns_listlike, append=append # type: ignore[operator] + cols + columns_listlike, + append=append, # type: ignore[operator] ) else: index_list: list[Index] | list[Series] @@ -649,14 +650,55 @@ def crosstab( Examples -------- - >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", - ... "bar", "bar", "foo", "foo", "foo"], dtype=object) - >>> b = np.array(["one", "one", "one", "two", "one", "one", - ... "one", "two", "two", "two", "one"], dtype=object) - >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", - ... "shiny", "dull", "shiny", "shiny", "shiny"], - ... dtype=object) - >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + >>> a = np.array( + ... [ + ... "foo", + ... "foo", + ... "foo", + ... "foo", + ... "bar", + ... "bar", + ... "bar", + ... "bar", + ... "foo", + ... "foo", + ... "foo", + ... ], + ... dtype=object, + ... ) + >>> b = np.array( + ... [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... "one", + ... ], + ... dtype=object, + ... ) + >>> c = np.array( + ... [ + ... "dull", + ... "dull", + ... "shiny", + ... "dull", + ... "dull", + ... "shiny", + ... "shiny", + ... "dull", + ... "shiny", + ... "shiny", + ... "shiny", + ... ], + ... dtype=object, + ... ) + >>> pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) b one two c dull shiny dull shiny a @@ -667,8 +709,8 @@ def crosstab( shown in the output because dropna is True by default. Set dropna=False to preserve categories with no data. 
- >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) - >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + >>> foo = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) + >>> bar = pd.Categorical(["d", "e"], categories=["d", "e", "f"]) >>> pd.crosstab(foo, bar) col_0 d e row_0 diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8c822ec58e011..e55d2a4c46f71 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -82,8 +82,9 @@ class _Unstacker: Examples -------- - >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), - ... ('two', 'a'), ('two', 'b')]) + >>> index = pd.MultiIndex.from_tuples( + ... [("one", "a"), ("one", "b"), ("two", "a"), ("two", "b")] + ... ) >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) >>> s one a 1 @@ -867,7 +868,7 @@ def _reorder_for_extension_array_stack( Examples -------- - >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) + >>> arr = np.array(["a", "b", "c", "d", "e", "f"]) >>> _reorder_for_extension_array_stack(arr, 2, 3) array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1') """ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py - >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), - ... 3, labels=["bad", "medium", "good"]) + >>> pd.cut( + ... np.array([1, 7, 5, 4, 6, 3]), + ... 3, + ... labels=["bad", "medium", "good"], + ... ) ['bad', 'good', 'medium', 'medium', 'good', 'bad'] Categories (3, object): ['bad' < 'medium' < 'good'] ``ordered=False`` will result in unordered categories when labels are passed. This parameter can be used to allow non-unique labels: - >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, - ... labels=["B", "A", "B"], ordered=False) + >>> pd.cut( + ... np.array([1, 7, 5, 4, 6, 3]), + ... 3, + ... labels=["B", "A", "B"], + ... ordered=False, + ... ) ['B', 'B', 'A', 'A', 'B', 'B'] Categories (2, object): ['A', 'B'] @@ -186,8 +193,10 @@ def cut( Passing a Series as an input returns a Series with categorical dtype: - >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), - ... index=['a', 'b', 'c', 'd', 'e']) + >>> s = pd.Series( + ... np.array([2, 4, 6, 8, 10]), + ... index=["a", "b", "c", "d", "e"], + ... ) >>> pd.cut(s, 3) ... # doctest: +ELLIPSIS a (1.992, 4.667] @@ -201,9 +210,17 @@ def cut( Passing a Series as an input returns a Series with mapping value. It is used to map numerically to intervals based on bins. - >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), - ... index=['a', 'b', 'c', 'd', 'e']) - >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) + >>> s = pd.Series( + ... np.array([2, 4, 6, 8, 10]), + ... index=["a", "b", "c", "d", "e"], + ... ) + >>> pd.cut( + ... s, + ... [0, 2, 4, 6, 8, 10], + ... labels=False, + ... retbins=True, + ... right=False, + ... ) ... # doctest: +ELLIPSIS (a 1.0 b 2.0 @@ -215,8 +232,14 @@ def cut( Use `drop` optional when bins is not unique - >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, - ... right=False, duplicates='drop') + >>> pd.cut( + ... s, + ... [0, 2, 4, 6, 10, 10], + ... labels=False, + ... retbins=True, + ... right=False, + ... duplicates="drop", + ... ) ... # doctest: +ELLIPSIS (a 1.0 b 2.0 @@ -231,7 +254,9 @@ def cut( is to the left of the first bin (which is closed on the right), and 1.5 falls between two bins. - >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + >>> bins = pd.IntervalIndex.from_tuples( + ... [(0, 1), (2, 3), (4, 5)] + ...
) >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]] diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index bcd51e095a1a1..1a058647da9d6 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -25,7 +25,7 @@ def cartesian_product(X) -> list[np.ndarray]: Examples -------- - >>> cartesian_product([list('ABC'), [1, 2]]) + >>> cartesian_product([list("ABC"), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])] diff --git a/pandas/core/series.py b/pandas/core/series.py --- a/pandas/core/series.py +++ b/pandas/core/series.py - >>> d = {'a': 1, 'b': 2, 'c': 3} - >>> ser = pd.Series(data=d, index=['a', 'b', 'c']) + >>> d = {"a": 1, "b": 2, "c": 3} + >>> ser = pd.Series(data=d, index=["a", "b", "c"]) >>> ser a 1 b 2 @@ -305,8 +305,8 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] The keys of the dictionary match with the Index values, hence the Index values have no effect. - >>> d = {'a': 1, 'b': 2, 'c': 3} - >>> ser = pd.Series(data=d, index=['x', 'y', 'z']) + >>> d = {"a": 1, "b": 2, "c": 3} + >>> ser = pd.Series(data=d, index=["x", "y", "z"]) >>> ser x NaN y NaN @@ -727,7 +727,7 @@ def name(self) -> Hashable: -------- The Series name can be set initially when calling the constructor. - >>> s = pd.Series([1, 2, 3], dtype=np.int64, name='Numbers') + >>> s = pd.Series([1, 2, 3], dtype=np.int64, name="Numbers") >>> s 0 1 1 2 @@ -742,8 +742,10 @@ def name(self) -> Hashable: The name of a Series within a DataFrame is its column name. - >>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], - ... columns=["Odd Numbers", "Even Numbers"]) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4], [5, 6]], + ... columns=["Odd Numbers", "Even Numbers"], + ... ) >>> df Odd Numbers Even Numbers 0 1 2 @@ -784,17 +786,18 @@ def values(self): >>> pd.Series([1, 2, 3]).values array([1, 2, 3]) - >>> pd.Series(list('aabc')).values + >>> pd.Series(list("aabc")).values array(['a', 'a', 'b', 'c'], dtype=object) - >>> pd.Series(list('aabc')).astype('category').values + >>> pd.Series(list("aabc")).astype("category").values ['a', 'a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] Timezone aware datetime data is converted to UTC: - >>> pd.Series(pd.date_range('20130101', periods=3, - ... tz='US/Eastern')).values + >>> pd.Series( + ... pd.date_range("20130101", periods=3, tz="US/Eastern") + ... ).values array(['2013-01-01T05:00:00.000000000', '2013-01-02T05:00:00.000000000', '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]') @@ -982,7 +985,9 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: For timezone-aware data, the timezones may be retained with ``dtype='object'`` - >>> tzser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> tzser = pd.Series( + ... pd.date_range("2000", periods=2, tz="CET") + ... ) >>> np.asarray(tzser, dtype="object") array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), Timestamp('2000-01-02 00:00:00+0100', tz='CET')], dtype=object) Or the values may be localized to UTC and the tzinfo discarded with ``dtype='datetime64[ns]'`` - >>> np.asarray(tzser, dtype="datetime64[ns]") # doctest: +ELLIPSIS + >>> np.asarray( + ... tzser, dtype="datetime64[ns]"
) # doctest: +ELLIPSIS array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ @@ -1523,7 +1530,7 @@ def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series: Examples -------- - >>> s = pd.Series(['a', 'b', 'c']) + >>> s = pd.Series(["a", "b", "c"]) >>> s 0 a 1 b @@ -1639,8 +1646,11 @@ def reset_index( Examples -------- - >>> s = pd.Series([1, 2, 3, 4], name='foo', - ... index=pd.Index(['a', 'b', 'c', 'd'], name='idx')) + >>> s = pd.Series( + ... [1, 2, 3, 4], + ... name="foo", + ... index=pd.Index(["a", "b", "c", "d"], name="idx"), + ... ) Generate a DataFrame with default index. @@ -1653,7 +1663,7 @@ def reset_index( To specify the name of the new column use `name`. - >>> s.reset_index(name='values') + >>> s.reset_index(name="values") idx values 0 a 1 1 b 2 @@ -1672,16 +1682,21 @@ def reset_index( The `level` parameter is interesting for Series with a multi-level index. - >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']), - ... np.array(['one', 'two', 'one', 'two'])] + >>> arrays = [ + ... np.array(["bar", "bar", "baz", "baz"]), + ... np.array(["one", "two", "one", "two"]), + ... ] >>> s2 = pd.Series( - ... range(4), name='foo', - ... index=pd.MultiIndex.from_arrays(arrays, - ... names=['a', 'b'])) + ... range(4), + ... name="foo", + ... index=pd.MultiIndex.from_arrays( + ... arrays, names=["a", "b"] + ... ), + ... ) To remove a specific level from the Index, use `level`. - >>> s2.reset_index(level='a') + >>> s2.reset_index(level="a") a foo b one bar 0 @@ -1961,7 +1976,7 @@ def items(self) -> Iterable[tuple[Hashable, Any]]: Examples -------- - >>> s = pd.Series(['A', 'B', 'C']) + >>> s = pd.Series(["A", "B", "C"]) >>> for index, value in s.items(): ... print(f"Index : {index}, Value : {value}") Index : 0, Value : A @@ -2007,8 +2022,7 @@ def to_dict(self, *, into: type[dict] = ...) -> dict: ) def to_dict( self, - into: type[MutableMappingT] - | MutableMappingT = dict, # type: ignore[assignment] + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] ) -> MutableMappingT: """ Convert Series to {label -> value} dict or dict-like object. @@ -2065,8 +2079,7 @@ def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: Examples -------- - >>> s = pd.Series(["a", "b", "c"], - ... name="vals") + >>> s = pd.Series(["a", "b", "c"], name="vals") >>> s.to_frame() vals 0 a @@ -2347,16 +2360,22 @@ def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation Examples -------- - >>> pd.Series([2, 1, 3, 3], name='A').unique() + >>> pd.Series([2, 1, 3, 3], name="A").unique() array([2, 1, 3]) - >>> pd.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique() + >>> pd.Series( + ... [pd.Timestamp("2016-01-01") for _ in range(3)] + ... ).unique() ['2016-01-01 00:00:00'] Length: 1, dtype: datetime64[ns] - >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern') - ... for _ in range(3)]).unique() + >>> pd.Series( + ... [ + ... pd.Timestamp("2016-01-01", tz="US/Eastern") + ... for _ in range(3) + ... ] + ... ).unique() ['2016-01-01 00:00:00-05:00'] Length: 1, dtype: datetime64[ns, US/Eastern] @@ -2364,11 +2383,14 @@ def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation An Categorical will return categories in the order of appearance and with the same dtype. - >>> pd.Series(pd.Categorical(list('baabc'))).unique() + >>> pd.Series(pd.Categorical(list("baabc"))).unique() ['b', 'a', 'c'] Categories (3, object): ['a', 'b', 'c'] - >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'), - ... 
ordered=True)).unique() + >>> pd.Series( + ... pd.Categorical( + ... list("baabc"), categories=list("abc"), ordered=True + ... ) + ... ).unique() ['b', 'a', 'c'] Categories (3, object): ['a' < 'b' < 'c'] """ @@ -2440,8 +2462,10 @@ def drop_duplicates( -------- Generate a Series with duplicated entries. - >>> s = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo'], - ... name='animal') + >>> s = pd.Series( + ... ["llama", "cow", "llama", "beetle", "llama", "hippo"], + ... name="animal", + ... ) >>> s 0 llama 1 cow @@ -2465,7 +2489,7 @@ def drop_duplicates( The value 'last' for parameter 'keep' keeps the last occurrence for each set of duplicated entries. - >>> s.drop_duplicates(keep='last') + >>> s.drop_duplicates(keep="last") 1 cow 3 beetle 4 llama @@ -2529,7 +2553,9 @@ def duplicated(self, keep: DropKeep = "first") -> Series: By default, for each set of duplicated values, the first occurrence is set on False and all others on True: - >>> animals = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama']) + >>> animals = pd.Series( + ... ["llama", "cow", "llama", "beetle", "llama"] + ... ) >>> animals.duplicated() 0 False 1 False @@ -2540,7 +2566,7 @@ def duplicated(self, keep: DropKeep = "first") -> Series: which is equivalent to - >>> animals.duplicated(keep='first') + >>> animals.duplicated(keep="first") 0 False 1 False 2 True @@ -2551,7 +2577,7 @@ def duplicated(self, keep: DropKeep = "first") -> Series: By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True: - >>> animals.duplicated(keep='last') + >>> animals.duplicated(keep="last") 0 True 1 False 2 True @@ -2618,8 +2644,9 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashab Examples -------- - >>> s = pd.Series(data=[1, None, 4, 1], - ... index=['A', 'B', 'C', 'D']) + >>> s = pd.Series( + ... data=[1, None, 4, 1], index=["A", "B", "C", "D"] + ... ) >>> s A 1.0 B NaN @@ -2701,8 +2728,9 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashab Examples -------- - >>> s = pd.Series(data=[1, None, 4, 3, 4], - ... index=['A', 'B', 'C', 'D', 'E']) + >>> s = pd.Series( + ... data=[1, None, 4, 3, 4], index=["A", "B", "C", "D", "E"] + ... ) >>> s A 1.0 B NaN @@ -2840,9 +2868,9 @@ def quantile( Examples -------- >>> s = pd.Series([1, 2, 3, 4]) - >>> s.quantile(.5) + >>> s.quantile(0.5) 2.5 - >>> s.quantile([.25, .5, .75]) + >>> s.quantile([0.25, 0.5, 0.75]) 0.25 1.75 0.50 2.50 0.75 3.25 @@ -2924,8 +2952,8 @@ def corr( >>> def histogram_intersection(a, b): ... v = np.minimum(a, b).sum().round(decimals=1) ... return v - >>> s1 = pd.Series([.2, .0, .6, .2]) - >>> s2 = pd.Series([.3, .6, .0, .1]) + >>> s1 = pd.Series([0.2, 0.0, 0.6, 0.2]) + >>> s2 = pd.Series([0.3, 0.6, 0.0, 0.1]) >>> s1.corr(s2, method=histogram_intersection) 0.3 @@ -3379,12 +3407,14 @@ def combine( Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. - >>> s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0}) + >>> s1 = pd.Series({"falcon": 330.0, "eagle": 160.0}) >>> s1 falcon 330.0 eagle 160.0 dtype: float64 - >>> s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + >>> s2 = pd.Series( + ... {"falcon": 345.0, "eagle": 200.0, "duck": 30.0} + ... 
) >>> s2 falcon 345.0 eagle 200.0 @@ -3480,8 +3510,8 @@ def combine_first(self, other) -> Series: Null values still persist if the location of that null value does not exist in `other` - >>> s1 = pd.Series({'falcon': np.nan, 'eagle': 160.0}) - >>> s2 = pd.Series({'eagle': 200.0, 'duck': 30.0}) + >>> s1 = pd.Series({"falcon": np.nan, "eagle": 160.0}) + >>> s2 = pd.Series({"eagle": 200.0, "duck": 30.0}) >>> s1.combine_first(s2) duck 30.0 eagle 160.0 @@ -3527,8 +3557,8 @@ def update(self, other: Series | Sequence | Mapping) -> None: 2 6 dtype: int64 - >>> s = pd.Series(['a', 'b', 'c']) - >>> s.update(pd.Series(['d', 'e'], index=[0, 2])) + >>> s = pd.Series(["a", "b", "c"]) + >>> s.update(pd.Series(["d", "e"], index=[0, 2])) >>> s 0 d 1 b @@ -3731,7 +3761,7 @@ def sort_values( Sort values putting NAs first - >>> s.sort_values(na_position='first') + >>> s.sort_values(na_position="first") 0 NaN 1 1.0 2 3.0 @@ -3741,7 +3771,7 @@ def sort_values( Sort a series of strings - >>> s = pd.Series(['z', 'b', 'd', 'a', 'c']) + >>> s = pd.Series(["z", "b", "d", "a", "c"]) >>> s 0 z 1 b @@ -3761,7 +3791,7 @@ def sort_values( Sort using a key function. Your `key` function will be given the ``Series`` of values and should return an array-like. - >>> s = pd.Series(['a', 'B', 'c', 'D', 'e']) + >>> s = pd.Series(["a", "B", "c", "D", "e"]) >>> s.sort_values() 1 B 3 D @@ -3959,7 +3989,7 @@ def sort_index( Examples -------- - >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) + >>> s = pd.Series(["a", "b", "c", "d"], index=[3, 2, 1, 4]) >>> s.sort_index() 1 c 2 b @@ -3979,8 +4009,8 @@ def sort_index( By default NaNs are put at the end, but use `na_position` to place them at the beginning - >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, np.nan]) - >>> s.sort_index(na_position='first') + >>> s = pd.Series(["a", "b", "c", "d"], index=[3, 2, 1, np.nan]) + >>> s.sort_index(na_position="first") NaN d 1.0 c 2.0 b @@ -3989,10 +4019,32 @@ def sort_index( Specify index level to sort - >>> arrays = [np.array(['qux', 'qux', 'foo', 'foo', - ... 'baz', 'baz', 'bar', 'bar']), - ... np.array(['two', 'one', 'two', 'one', - ... 'two', 'one', 'two', 'one'])] + >>> arrays = [ + ... np.array( + ... [ + ... "qux", + ... "qux", + ... "foo", + ... "foo", + ... "baz", + ... "baz", + ... "bar", + ... "bar", + ... ] + ... ), + ... np.array( + ... [ + ... "two", + ... "one", + ... "two", + ... "one", + ... "two", + ... "one", + ... "two", + ... "one", + ... ] + ... ), + ... ] >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=arrays) >>> s.sort_index(level=1) bar one 8 @@ -4020,8 +4072,8 @@ def sort_index( Apply a key function before sorting - >>> s = pd.Series([1, 2, 3, 4], index=['A', 'b', 'C', 'd']) - >>> s.sort_index(key=lambda x : x.str.lower()) + >>> s = pd.Series([1, 2, 3, 4], index=["A", "b", "C", "d"]) + >>> s.sort_index(key=lambda x: x.str.lower()) A 1 b 2 C 3 @@ -4150,11 +4202,18 @@ def nlargest( Examples -------- - >>> countries_population = {"Italy": 59000000, "France": 65000000, - ... "Malta": 434000, "Maldives": 434000, - ... "Brunei": 434000, "Iceland": 337000, - ... "Nauru": 11300, "Tuvalu": 11300, - ... "Anguilla": 11300, "Montserrat": 5200} + >>> countries_population = { + ... "Italy": 59000000, + ... "France": 65000000, + ... "Malta": 434000, + ... "Maldives": 434000, + ... "Brunei": 434000, + ... "Iceland": 337000, + ... "Nauru": 11300, + ... "Tuvalu": 11300, + ... "Anguilla": 11300, + ... "Montserrat": 5200, + ... 
} >>> s = pd.Series(countries_population) >>> s Italy 59000000 @@ -4192,7 +4251,7 @@ def nlargest( Brunei will be kept since it is the last with value 434000 based on the index order. - >>> s.nlargest(3, keep='last') + >>> s.nlargest(3, keep="last") France 65000000 Italy 59000000 Brunei 434000 @@ -4201,7 +4260,7 @@ def nlargest( The `n` largest elements where ``n=3`` with all duplicates kept. Note that the returned Series has five elements due to the three duplicates. - >>> s.nlargest(3, keep='all') + >>> s.nlargest(3, keep="all") France 65000000 Italy 59000000 Malta 434000 @@ -4250,11 +4309,18 @@ def nsmallest( Examples -------- - >>> countries_population = {"Italy": 59000000, "France": 65000000, - ... "Brunei": 434000, "Malta": 434000, - ... "Maldives": 434000, "Iceland": 337000, - ... "Nauru": 11300, "Tuvalu": 11300, - ... "Anguilla": 11300, "Montserrat": 5200} + >>> countries_population = { + ... "Italy": 59000000, + ... "France": 65000000, + ... "Brunei": 434000, + ... "Malta": 434000, + ... "Maldives": 434000, + ... "Iceland": 337000, + ... "Nauru": 11300, + ... "Tuvalu": 11300, + ... "Anguilla": 11300, + ... "Montserrat": 5200, + ... } >>> s = pd.Series(countries_population) >>> s Italy 59000000 @@ -4292,7 +4358,7 @@ def nsmallest( duplicates. Anguilla and Tuvalu will be kept since they are the last with value 11300 based on the index order. - >>> s.nsmallest(3, keep='last') + >>> s.nsmallest(3, keep="last") Montserrat 5200 Anguilla 11300 Tuvalu 11300 @@ -4301,7 +4367,7 @@ def nsmallest( The `n` smallest elements where ``n=3`` with all duplicates kept. Note that the returned Series has four elements due to the three duplicates. - >>> s.nsmallest(3, keep='all') + >>> s.nsmallest(3, keep="all") Montserrat 5200 Nauru 11300 Tuvalu 11300 @@ -4425,8 +4491,19 @@ def reorder_levels(self, order: Sequence[Level]) -> Series: Examples -------- - >>> arrays = [np.array(["dog", "dog", "cat", "cat", "bird", "bird"]), - ... np.array(["white", "black", "white", "black", "white", "black"])] + >>> arrays = [ + ... np.array(["dog", "dog", "cat", "cat", "bird", "bird"]), + ... np.array( + ... [ + ... "white", + ... "black", + ... "white", + ... "black", + ... "white", + ... "black", + ... ] + ... ), + ... ] >>> s = pd.Series([1, 2, 3, 3, 5, 2], index=arrays) >>> s dog white 1 @@ -4488,7 +4565,7 @@ def explode(self, ignore_index: bool = False) -> Series: Examples -------- - >>> s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) + >>> s = pd.Series([[1, 2, 3], "foo", [], [3, 4]]) >>> s 0 [1, 2, 3] 1 foo @@ -4550,9 +4627,12 @@ def unstack( Examples -------- - >>> s = pd.Series([1, 2, 3, 4], - ... index=pd.MultiIndex.from_product([['one', 'two'], - ... ['a', 'b']])) + >>> s = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.MultiIndex.from_product( + ... [["one", "two"], ["a", "b"]] + ... ), + ... ) >>> s one a 1 b 2 @@ -4619,7 +4699,7 @@ def map( Examples -------- - >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit']) + >>> s = pd.Series(["cat", "dog", np.nan, "rabbit"]) >>> s 0 cat 1 dog @@ -4631,7 +4711,7 @@ def map( in the ``dict`` are converted to ``NaN``, unless the dict has a default value (e.g. 
``defaultdict``): - >>> s.map({'cat': 'kitten', 'dog': 'puppy'}) + >>> s.map({"cat": "kitten", "dog": "puppy"}) 0 kitten 1 puppy 2 NaN @@ -4640,7 +4720,7 @@ def map( It also accepts a function: - >>> s.map('I am a {}'.format) + >>> s.map("I am a {}".format) 0 I am a cat 1 I am a dog 2 I am a nan @@ -4650,7 +4730,7 @@ def map( To avoid applying the function to missing values (and keep them as ``NaN``) ``na_action='ignore'`` can be used: - >>> s.map('I am a {}'.format, na_action='ignore') + >>> s.map("I am a {}".format, na_action="ignore") 0 I am a cat 1 I am a dog 2 NaN @@ -4811,8 +4891,9 @@ def apply( -------- Create a series with typical summer temperatures for each city. - >>> s = pd.Series([20, 21, 12], - ... index=['London', 'New York', 'Helsinki']) + >>> s = pd.Series( + ... [20, 21, 12], index=["London", "New York", "Helsinki"] + ... ) >>> s London 20 New York 21 @@ -4823,7 +4904,7 @@ def apply( argument to ``apply()``. >>> def square(x): - ... return x ** 2 + ... return x**2 >>> s.apply(square) London 400 New York 441 @@ -4833,7 +4914,7 @@ def apply( Square the values by passing an anonymous function as an argument to ``apply()``. - >>> s.apply(lambda x: x ** 2) + >>> s.apply(lambda x: x**2) London 400 New York 441 Helsinki 144 @@ -5027,7 +5108,7 @@ def rename( 1 2 2 3 Name: my_name, dtype: int64 - >>> s.rename(lambda x: x ** 2) # function, changes labels + >>> s.rename(lambda x: x**2) # function, changes labels 0 1 1 2 4 3 @@ -5271,7 +5352,7 @@ def drop( Examples -------- - >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C']) + >>> s = pd.Series(data=np.arange(3), index=["A", "B", "C"]) >>> s A 0 B 1 @@ -5280,18 +5361,25 @@ def drop( Drop labels B en C - >>> s.drop(labels=['B', 'C']) + >>> s.drop(labels=["B", "C"]) A 0 dtype: int64 Drop 2nd level label in MultiIndex Series - >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], - ... index=midx) + >>> midx = pd.MultiIndex( + ... levels=[ + ... ["llama", "cow", "falcon"], + ... ["speed", "weight", "length"], + ... ], + ... codes=[ + ... [0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2], + ... ], + ... ) + >>> s = pd.Series( + ... [45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx + ... ) >>> s llama speed 45.0 weight 200.0 @@ -5304,7 +5392,7 @@ def drop( length 0.3 dtype: float64 - >>> s.drop(labels='weight', level=1) + >>> s.drop(labels="weight", level=1) llama speed 45.0 length 1.2 cow speed 30.0 @@ -5473,9 +5561,11 @@ def isin(self, values) -> Series: Examples -------- - >>> s = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', - ... 'hippo'], name='animal') - >>> s.isin(['cow', 'llama']) + >>> s = pd.Series( + ... ["llama", "cow", "llama", "beetle", "llama", "hippo"], + ... name="animal", + ... ) + >>> s.isin(["cow", "llama"]) 0 True 1 True 2 True @@ -5486,7 +5576,7 @@ def isin(self, values) -> Series: To invert the boolean values, use the ``~`` operator: - >>> ~s.isin(['cow', 'llama']) + >>> ~s.isin(["cow", "llama"]) 0 False 1 False 2 False @@ -5498,7 +5588,7 @@ def isin(self, values) -> Series: Passing a single string as ``s.isin('llama')`` will raise an error. 
Use a list of one element instead: - >>> s.isin(['llama']) + >>> s.isin(["llama"]) 0 True 1 False 2 True @@ -5509,10 +5599,10 @@ def isin(self, values) -> Series: Strings and integers are distinct and are therefore not comparable: - >>> pd.Series([1]).isin(['1']) + >>> pd.Series([1]).isin(["1"]) 0 False dtype: bool - >>> pd.Series([1.1]).isin(['1.1']) + >>> pd.Series([1.1]).isin(["1.1"]) 0 False dtype: bool """ @@ -5586,8 +5676,8 @@ def between( `left` and `right` can be any scalar value: - >>> s = pd.Series(['Alice', 'Bob', 'Carol', 'Eve']) - >>> s.between('Anna', 'Daniel') + >>> s = pd.Series(["Alice", "Bob", "Carol", "Eve"]) + >>> s.between("Anna", "Daniel") 0 False 1 True 2 True @@ -5704,7 +5794,7 @@ def dropna( Examples -------- - >>> ser = pd.Series([1., 2., np.nan]) + >>> ser = pd.Series([1.0, 2.0, np.nan]) >>> ser 0 1.0 1 2.0 @@ -5721,7 +5811,7 @@ def dropna( Empty strings are not considered NA values. ``None`` is considered an NA value. - >>> ser = pd.Series([np.nan, 2, pd.NaT, '', None, 'I stay']) + >>> ser = pd.Series([np.nan, 2, pd.NaT, "", None, "I stay"]) >>> ser 0 NaN 1 2 @@ -5797,7 +5887,7 @@ def to_timestamp( Examples -------- - >>> idx = pd.PeriodIndex(['2023', '2024', '2025'], freq='Y') + >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") >>> s1 = pd.Series([1, 2, 3], index=idx) >>> s1 2023 1 @@ -5817,7 +5907,7 @@ def to_timestamp( Using `freq` which is the offset that the Timestamps will have >>> s2 = pd.Series([1, 2, 3], index=idx) - >>> s2 = s2.to_timestamp(freq='M') + >>> s2 = s2.to_timestamp(freq="M") >>> s2 2023-01-31 1 2024-01-31 2 @@ -5862,7 +5952,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series Examples -------- - >>> idx = pd.DatetimeIndex(['2023', '2024', '2025']) + >>> idx = pd.DatetimeIndex(["2023", "2024", "2025"]) >>> s = pd.Series([1, 2, 3], index=idx) >>> s = s.to_period() >>> s diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 25f7e7e9f832b..3369df5da4cba 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -2,9 +2,7 @@ _shared_docs: dict[str, str] = {} -_shared_docs[ - "aggregate" -] = """ +_shared_docs["aggregate"] = """ Aggregate using one or more operations over the specified axis. Parameters @@ -53,9 +51,7 @@ A passed user-defined-function will be passed a Series for evaluation. {examples}""" -_shared_docs[ - "compare" -] = """ +_shared_docs["compare"] = """ Compare to another {klass} and show the differences. Parameters @@ -85,9 +81,7 @@ .. versionadded:: 1.5.0 """ -_shared_docs[ - "groupby" -] = """ +_shared_docs["groupby"] = """ Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the @@ -195,9 +189,7 @@ iterating through groups, selecting a group, aggregation, and more. """ -_shared_docs[ - "melt" -] = """ +_shared_docs["melt"] = """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. This function is useful to massage a DataFrame into a format where one @@ -311,9 +303,7 @@ 2 c B E 5 """ -_shared_docs[ - "transform" -] = """ +_shared_docs["transform"] = """ Call ``func`` on self producing a {klass} with the same axis shape as self. Parameters @@ -438,9 +428,7 @@ 6 2 n 4 """ -_shared_docs[ - "storage_options" -] = """storage_options : dict, optional +_shared_docs["storage_options"] = """storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. 
For HTTP(S) URLs the key-value pairs are forwarded to ``urllib.request.Request`` as header options. For other @@ -450,9 +438,7 @@ <https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files>`_.""" -_shared_docs[ -"compression_options" -] = """compression : str or dict, default 'infer' +_shared_docs["compression_options"] = """compression : str or dict, default 'infer' For on-the-fly compression of the output data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' @@ -471,9 +457,7 @@ .. versionadded:: 1.5.0 Added support for `.tar` files.""" -_shared_docs[ -"decompression_options" -] = """compression : str or dict, default 'infer' +_shared_docs["decompression_options"] = """compression : str or dict, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' @@ -493,9 +477,7 @@ .. versionadded:: 1.5.0 Added support for `.tar` files.""" -_shared_docs[ -"replace" -] = """ +_shared_docs["replace"] = """ Replace values given in `to_replace` with `value`. Values of the {klass} are replaced with other values dynamically. @@ -817,9 +799,7 @@ 4 4 e e """ -_shared_docs[ -"idxmin" -] = """ +_shared_docs["idxmin"] = """ Return index of first occurrence of minimum over requested axis. NA/null values are excluded. @@ -884,9 +864,7 @@ dtype: object """ -_shared_docs[ -"idxmax" -] = """ +_shared_docs["idxmax"] = """ Return index of first occurrence of maximum over requested axis. NA/null values are excluded. diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 75866c6f6013a..be2433739479e 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -538,20 +538,20 @@ def cat( When not passing `others`, all values are concatenated into a single string: - >>> s = pd.Series(['a', 'b', np.nan, 'd']) - >>> s.str.cat(sep=' ') + >>> s = pd.Series(["a", "b", np.nan, "d"]) + >>> s.str.cat(sep=" ") 'a b d' By default, NA values in the Series are ignored. Using `na_rep`, they can be given a representation: - >>> s.str.cat(sep=' ', na_rep='?') + >>> s.str.cat(sep=" ", na_rep="?") 'a b ? d' If `others` is specified, corresponding values are concatenated with the separator. Result will be a Series of strings. - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') + >>> s.str.cat(["A", "B", "C", "D"], sep=",") 0 a,A 1 b,B 2 NaN 3 d,D dtype: object Missing values will remain missing in the result, but can again be represented using `na_rep` - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') + >>> s.str.cat(["A", "B", "C", "D"], sep=",", na_rep="-") 0 a,A 1 b,B 2 -,C 3 d,D dtype: object If `sep` is not specified, the values are concatenated without separation. - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') + >>> s.str.cat(["A", "B", "C", "D"], na_rep="-") 0 aA 1 bB 2 -C 3 dD dtype: object Series with different indexes can be aligned before concatenation. The `join`-keyword works as in other methods.
- >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join='left', na_rep='-') + >>> t = pd.Series(["d", "a", "e", "c"], index=[3, 0, 4, 2]) + >>> s.str.cat(t, join="left", na_rep="-") 0 aa 1 b- 2 -c 3 dd dtype: object >>> - >>> s.str.cat(t, join='outer', na_rep='-') + >>> s.str.cat(t, join="outer", na_rep="-") 0 aa 1 b- 2 -c @@ -597,13 +597,13 @@ def cat( 4 -e dtype: object >>> - >>> s.str.cat(t, join='inner', na_rep='-') + >>> s.str.cat(t, join="inner", na_rep="-") 0 aa 2 -c 3 dd dtype: object >>> - >>> s.str.cat(t, join='right', na_rep='-') + >>> s.str.cat(t, join="right", na_rep="-") 3 dd 0 aa 4 -e @@ -708,9 +708,7 @@ def cat( out = res_ser.__finalize__(self._orig, method="str_cat") return out - _shared_docs[ - "str_split" - ] = r""" + _shared_docs["str_split"] = r""" Split strings around given separator/delimiter. Splits the string in the Series/Index from the %(side)s, @@ -947,9 +945,7 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False): result, expand=expand, returns_string=expand, dtype=dtype ) - _shared_docs[ - "str_partition" - ] = """ + _shared_docs["str_partition"] = """ Split the string at the %(side)s occurrence of `sep`. This method splits the string at the %(side)s occurrence of `sep`, @@ -1087,12 +1083,16 @@ def get(self, i): Examples -------- - >>> s = pd.Series(["String", - ... (1, 2, 3), - ... ["a", "b", "c"], - ... 123, - ... -456, - ... {1: "Hello", "2": "World"}]) + >>> s = pd.Series( + ... [ + ... "String", + ... (1, 2, 3), + ... ["a", "b", "c"], + ... 123, + ... -456, + ... {1: "Hello", "2": "World"}, + ... ] + ... ) >>> s 0 String 1 (1, 2, 3) @@ -1122,9 +1122,13 @@ def get(self, i): Return element with given key - >>> s = pd.Series([{"name": "Hello", "value": "World"}, - ... {"name": "Goodbye", "value": "Planet"}]) - >>> s.str.get('name') + >>> s = pd.Series( + ... [ + ... {"name": "Hello", "value": "World"}, + ... {"name": "Goodbye", "value": "Planet"}, + ... ] + ... ) + >>> s.str.get("name") 0 Hello 1 Goodbye dtype: object @@ -1171,11 +1175,15 @@ def join(self, sep: str): -------- Example with a list that contains non-string elements. - >>> s = pd.Series([['lion', 'elephant', 'zebra'], - ... [1.1, 2.2, 3.3], - ... ['cat', np.nan, 'dog'], - ... ['cow', 4.5, 'goat'], - ... ['duck', ['swan', 'fish'], 'guppy']]) + >>> s = pd.Series( + ... [ + ... ["lion", "elephant", "zebra"], + ... [1.1, 2.2, 3.3], + ... ["cat", np.nan, "dog"], + ... ["cow", 4.5, "goat"], + ... ["duck", ["swan", "fish"], "guppy"], + ... ] + ... ) >>> s 0 [lion, elephant, zebra] 1 [1.1, 2.2, 3.3] @@ -1187,7 +1195,7 @@ def join(self, sep: str): Join all lists using a '-'. The lists containing object(s) of types other than str will produce a NaN. - >>> s.str.join('-') + >>> s.str.join("-") 0 lion-elephant-zebra 1 NaN 2 NaN @@ -1243,8 +1251,10 @@ def contains( -------- Returning a Series of booleans using only a literal pattern. - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan]) - >>> s1.str.contains('og', regex=False) + >>> s1 = pd.Series( + ... ["Mouse", "dog", "house and parrot", "23", np.nan] + ... ) + >>> s1.str.contains("og", regex=False) 0 False 1 True 2 False @@ -1254,13 +1264,15 @@ def contains( Returning an Index of booleans using only a literal pattern. - >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan]) - >>> ind.str.contains('23', regex=False) + >>> ind = pd.Index( + ... ["Mouse", "dog", "house and parrot", "23.0", np.nan] + ... 
) + >>> ind.str.contains("23", regex=False) Index([False, False, False, True, nan], dtype='object') Specifying case sensitivity using `case`. - >>> s1.str.contains('oG', case=True, regex=True) + >>> s1.str.contains("oG", case=True, regex=True) 0 False 1 False 2 False @@ -1272,7 +1284,7 @@ def contains( with `False`. If Series or Index does not contain NaN values the resultant dtype will be `bool`, otherwise, an `object` dtype. - >>> s1.str.contains('og', na=False, regex=True) + >>> s1.str.contains("og", na=False, regex=True) 0 False 1 True 2 False @@ -1282,7 +1294,7 @@ def contains( Returning 'house' or 'dog' when either expression occurs in a string. - >>> s1.str.contains('house|dog', regex=True) + >>> s1.str.contains("house|dog", regex=True) 0 False 1 True 2 True @@ -1293,7 +1305,7 @@ def contains( Ignoring case sensitivity using `flags` with regex. >>> import re - >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) + >>> s1.str.contains("PARROT", flags=re.IGNORECASE, regex=True) 0 False 1 False 2 True @@ -1303,7 +1315,7 @@ def contains( Returning any digit using regular expression. - >>> s1.str.contains('\\d', regex=True) + >>> s1.str.contains("\\d", regex=True) 0 False 1 False 2 False @@ -1316,8 +1328,8 @@ def contains( return `True`. However, '.0' as a regex matches any character followed by a 0. - >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) + >>> s2 = pd.Series(["40", "40.0", "41", "41.0", "35"]) + >>> s2.str.contains(".0", regex=True) 0 True 1 True 2 False @@ -1408,7 +1420,7 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): Examples -------- >>> ser = pd.Series(["cat", "duck", "dove"]) - >>> ser.str.fullmatch(r'd.+') + >>> ser.str.fullmatch(r"d.+") 0 False 1 True 2 True @@ -1487,7 +1499,9 @@ def replace( regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are left as is: - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) + >>> pd.Series(["foo", "fuz", np.nan]).str.replace( + ... "f.", "ba", regex=True + ... ) 0 bao 1 baz 2 NaN @@ -1496,7 +1510,9 @@ def replace( When `pat` is a string and `regex` is False, every `pat` is replaced with `repl` as with :meth:`str.replace`: - >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) + >>> pd.Series(["f.o", "fuz", np.nan]).str.replace( + ... "f.", "ba", regex=False + ... ) 0 bao 1 fuz 2 NaN @@ -1508,7 +1524,9 @@ def replace( To get the idea: - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True) + >>> pd.Series(["foo", "fuz", np.nan]).str.replace( + ... "f", repr, regex=True + ... 
) 0 oo 1 uz 2 NaN @@ -1517,8 +1535,8 @@ def replace( Reverse every lowercase alphabetic word: >>> repl = lambda m: m.group(0)[::-1] - >>> ser = pd.Series(['foo 123', 'bar baz', np.nan]) - >>> ser.str.replace(r'[a-z]+', repl, regex=True) + >>> ser = pd.Series(["foo 123", "bar baz", np.nan]) + >>> ser.str.replace(r"[a-z]+", repl, regex=True) 0 oof 123 1 rab zab 2 NaN @@ -1527,8 +1545,8 @@ def replace( Using regex groups (extract second group and swap case): >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" - >>> repl = lambda m: m.group('two').swapcase() - >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz']) + >>> repl = lambda m: m.group("two").swapcase() + >>> ser = pd.Series(["One Two Three", "Foo Bar Baz"]) >>> ser.str.replace(pat, repl, regex=True) 0 tWO 1 bAR @@ -1537,8 +1555,10 @@ def replace( Using a compiled regex with flags >>> import re - >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True) + >>> regex_pat = re.compile(r"FUZ", flags=re.IGNORECASE) + >>> pd.Series(["foo", "fuz", np.nan]).str.replace( + ... regex_pat, "bar", regex=True + ... ) 0 foo 1 bar 2 NaN @@ -1588,7 +1608,7 @@ def repeat(self, repeats): Examples -------- - >>> s = pd.Series(['a', 'b', 'c']) + >>> s = pd.Series(["a", "b", "c"]) >>> s 0 a 1 b @@ -1663,12 +1683,12 @@ def pad( 1 tiger dtype: object - >>> s.str.pad(width=10, side='right', fillchar='-') + >>> s.str.pad(width=10, side="right", fillchar="-") 0 caribou--- 1 tiger----- dtype: object - >>> s.str.pad(width=10, side='both', fillchar='-') + >>> s.str.pad(width=10, side="both", fillchar="-") 0 -caribou-- 1 --tiger--- dtype: object @@ -1687,9 +1707,7 @@ def pad( result = self._data.array._str_pad(width, side=side, fillchar=fillchar) return self._wrap_result(result) - _shared_docs[ - "str_pad" - ] = """ + _shared_docs["str_pad"] = """ Pad %(side)s side of strings in the Series/Index. Equivalent to :meth:`str.%(method)s`. @@ -1789,7 +1807,7 @@ def zfill(self, width: int): Examples -------- - >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) + >>> s = pd.Series(["-1", "1", "1000", 10, np.nan]) >>> s 0 -1 1 1 @@ -1924,7 +1942,7 @@ def slice_replace(self, start=None, stop=None, repl=None): Examples -------- - >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) + >>> s = pd.Series(["a", "ab", "abc", "abdc", "abcde"]) >>> s 0 a 1 ab @@ -1936,7 +1954,7 @@ def slice_replace(self, start=None, stop=None, repl=None): Specify just `start`, meaning replace `start` until the end of the string with `repl`. - >>> s.str.slice_replace(1, repl='X') + >>> s.str.slice_replace(1, repl="X") 0 aX 1 aX 2 aX @@ -1947,7 +1965,7 @@ def slice_replace(self, start=None, stop=None, repl=None): Specify just `stop`, meaning the start of the string to `stop` is replaced with `repl`, and the rest of the string is included. - >>> s.str.slice_replace(stop=2, repl='X') + >>> s.str.slice_replace(stop=2, repl="X") 0 X 1 X 2 Xc @@ -1959,7 +1977,7 @@ def slice_replace(self, start=None, stop=None, repl=None): replaced with `repl`. Everything before or after `start` and `stop` is included as is. 
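Before the ``slice_replace`` examples resume below, a compact doctest recapping the padding behavior whose quoting was normalized above; the data here is illustrative, not taken from the patch:

>>> import pandas as pd
>>> s = pd.Series(["cat", "dog"])
>>> s.str.pad(width=5, side="both", fillchar="-").tolist()
['-cat-', '-dog-']
>>> pd.Series(["7", "42"]).str.zfill(3).tolist()
['007', '042']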
- >>> s.str.slice_replace(start=1, stop=3, repl='X') + >>> s.str.slice_replace(start=1, stop=3, repl="X") 0 aX 1 aX 2 aX @@ -1990,8 +2008,8 @@ def decode(self, encoding, errors: str = "strict"): -------- For Series: - >>> ser = pd.Series([b'cow', b'123', b'()']) - >>> ser.str.decode('ascii') + >>> ser = pd.Series([b"cow", b"123", b"()"]) + >>> ser.str.decode("ascii") 0 cow 1 123 2 () @@ -2027,8 +2045,8 @@ def encode(self, encoding, errors: str = "strict"): Examples -------- - >>> ser = pd.Series(['cow', '123', '()']) - >>> ser.str.encode(encoding='ascii') + >>> ser = pd.Series(["cow", "123", "()"]) + >>> ser.str.encode(encoding="ascii") 0 b'cow' 1 b'123' 2 b'()' @@ -2037,9 +2055,7 @@ def encode(self, encoding, errors: str = "strict"): result = self._data.array._str_encode(encoding, errors) return self._wrap_result(result, returns_string=False) - _shared_docs[ - "str_strip" - ] = r""" + _shared_docs["str_strip"] = r""" Remove %(position)s characters. Strip whitespaces (including newlines) or a set of specified characters @@ -2144,9 +2160,7 @@ def rstrip(self, to_strip=None): result = self._data.array._str_rstrip(to_strip) return self._wrap_result(result) - _shared_docs[ - "str_removefix" - ] = r""" + _shared_docs["str_removefix"] = r""" Remove a %(side)s from an object series. If the %(side)s is not present, the original string will be returned. @@ -2258,7 +2272,9 @@ def wrap(self, width: int, **kwargs): Examples -------- - >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) + >>> s = pd.Series( + ... ["line to be wrapped", "another line to be wrapped"] + ... ) >>> s.str.wrap(12) 0 line to be\nwrapped 1 another line\nto be\nwrapped @@ -2292,13 +2308,13 @@ def get_dummies(self, sep: str = "|"): Examples -------- - >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() + >>> pd.Series(["a|b", "a", "a|c"]).str.get_dummies() a b c 0 1 1 0 1 1 0 0 2 1 0 1 - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies() a b c 0 1 1 0 1 0 0 0 @@ -2336,7 +2352,7 @@ def translate(self, table): Examples -------- >>> ser = pd.Series(["El niño", "Françoise"]) - >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'}) + >>> mytable = str.maketrans({"ñ": "n", "ç": "c"}) >>> ser.str.translate(mytable) 0 El nino 1 Francoise @@ -2383,8 +2399,10 @@ def count(self, pat, flags: int = 0): Examples -------- - >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) - >>> s.str.count('a') + >>> s = pd.Series( + ... ["A", "B", "Aaba", "Baca", np.nan, "CABA", "cat"] + ... ) + >>> s.str.count("a") 0 0.0 1 0.0 2 2.0 @@ -2396,8 +2414,8 @@ def count(self, pat, flags: int = 0): Escape ``'$'`` to find the literal dollar sign. 
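Before the escaped-dollar ``count`` example below, a quick check of the ``get_dummies`` and ``translate`` behavior documented above, with made-up inputs and default dtypes assumed:

>>> import pandas as pd
>>> pd.Series(["a|b", "a"]).str.get_dummies().columns.tolist()
['a', 'b']
>>> pd.Series(["café"]).str.translate(str.maketrans({"é": "e"})).tolist()
['cafe']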
- >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') + >>> s = pd.Series(["$", "B", "Aab$", "$$ca", "C$B$", "cat"]) + >>> s.str.count("\\$") 0 1 1 0 2 1 @@ -2408,7 +2426,7 @@ def count(self, pat, flags: int = 0): This is also available on Index - >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') + >>> pd.Index(["A", "A", "Aaba", "cat"]).str.count("a") Index([0, 0, 2, 1], dtype='int64') """ result = self._data.array._str_count(pat, flags) @@ -2447,7 +2465,7 @@ def startswith( Examples -------- - >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) + >>> s = pd.Series(["bat", "Bear", "cat", np.nan]) >>> s 0 bat 1 Bear @@ -2455,14 +2473,14 @@ def startswith( 3 NaN dtype: object - >>> s.str.startswith('b') + >>> s.str.startswith("b") 0 True 1 False 2 False 3 NaN dtype: object - >>> s.str.startswith(('b', 'B')) + >>> s.str.startswith(("b", "B")) 0 True 1 True 2 False @@ -2471,7 +2489,7 @@ def startswith( Specifying `na` to be `False` instead of `NaN`. - >>> s.str.startswith('b', na=False) + >>> s.str.startswith("b", na=False) 0 True 1 False 2 False @@ -2517,7 +2535,7 @@ def endswith( Examples -------- - >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) + >>> s = pd.Series(["bat", "bear", "caT", np.nan]) >>> s 0 bat 1 bear @@ -2525,14 +2543,14 @@ def endswith( 3 NaN dtype: object - >>> s.str.endswith('t') + >>> s.str.endswith("t") 0 True 1 False 2 False 3 NaN dtype: object - >>> s.str.endswith(('t', 'T')) + >>> s.str.endswith(("t", "T")) 0 True 1 False 2 True @@ -2541,7 +2559,7 @@ def endswith( Specifying `na` to be `False` instead of `NaN`. - >>> s.str.endswith('t', na=False) + >>> s.str.endswith("t", na=False) 0 True 1 False 2 False @@ -2588,11 +2606,11 @@ def findall(self, pat, flags: int = 0): Examples -------- - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) + >>> s = pd.Series(["Lion", "Monkey", "Rabbit"]) The search for the pattern 'Monkey' returns one match: - >>> s.str.findall('Monkey') + >>> s.str.findall("Monkey") 0 [] 1 [Monkey] 2 [] @@ -2601,7 +2619,7 @@ def findall(self, pat, flags: int = 0): On the other hand, the search for the pattern 'MONKEY' doesn't return any match: - >>> s.str.findall('MONKEY') + >>> s.str.findall("MONKEY") 0 [] 1 [] 2 [] @@ -2611,7 +2629,7 @@ def findall(self, pat, flags: int = 0): to find the pattern 'MONKEY' ignoring the case: >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) + >>> s.str.findall("MONKEY", flags=re.IGNORECASE) 0 [] 1 [Monkey] 2 [] @@ -2620,7 +2638,7 @@ def findall(self, pat, flags: int = 0): When the pattern matches more than one string in the Series, all matches are returned: - >>> s.str.findall('on') + >>> s.str.findall("on") 0 [on] 1 [on] 2 [] @@ -2629,7 +2647,7 @@ def findall(self, pat, flags: int = 0): Regular expressions are supported too. For instance, the search for all the strings ending with the word 'on' is shown next: - >>> s.str.findall('on$') + >>> s.str.findall("on$") 0 [on] 1 [] 2 [] @@ -2638,7 +2656,7 @@ def findall(self, pat, flags: int = 0): If the pattern is found more than once in the same string, then a list of multiple strings is returned: - >>> s.str.findall('b') + >>> s.str.findall("b") 0 [] 1 [] 2 [b, b] @@ -2691,8 +2709,8 @@ def extract( A pattern with two groups will return a DataFrame with two columns. Non-matches will be NaN. 
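As a sanity check on the matching methods reformatted above (``startswith`` with a tuple of prefixes, ``findall``), a short doctest with illustrative data; the patch's ``extract`` examples follow:

>>> import pandas as pd
>>> s = pd.Series(["bat", "Bear", "cat"])
>>> s.str.startswith(("b", "B")).tolist()
[True, True, False]
>>> s.str.findall("at").tolist()
[['at'], [], ['at']]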
- >>> s = pd.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') + >>> s = pd.Series(["a1", "b2", "c3"]) + >>> s.str.extract(r"([ab])(\d)") 0 1 0 a 1 1 b 2 @@ -2700,7 +2718,7 @@ def extract( A pattern may contain optional groups. - >>> s.str.extract(r'([ab])?(\d)') + >>> s.str.extract(r"([ab])?(\d)") 0 1 0 a 1 1 b 2 @@ -2708,7 +2726,7 @@ def extract( Named groups will become column names in the result. - >>> s.str.extract(r'(?P[ab])(?P\d)') + >>> s.str.extract(r"(?P[ab])(?P\d)") letter digit 0 a 1 1 b 2 @@ -2717,7 +2735,7 @@ def extract( A pattern with one group will return a DataFrame with one column if expand=True. - >>> s.str.extract(r'[ab](\d)', expand=True) + >>> s.str.extract(r"[ab](\d)", expand=True) 0 0 1 1 2 @@ -2725,7 +2743,7 @@ def extract( A pattern with one group will return a Series if expand=False. - >>> s.str.extract(r'[ab](\d)', expand=False) + >>> s.str.extract(r"[ab](\d)", expand=False) 0 1 1 2 2 NaN @@ -2853,9 +2871,7 @@ def extractall(self, pat, flags: int = 0) -> DataFrame: # TODO: dispatch return str_extractall(self._orig, pat, flags) - _shared_docs[ - "find" - ] = """ + _shared_docs["find"] = """ Return %(side)s indexes in each strings in the Series/Index. Each of returned indexes corresponds to the position where the @@ -2953,17 +2969,15 @@ def normalize(self, form): Examples -------- - >>> ser = pd.Series(['ñ']) - >>> ser.str.normalize('NFC') == ser.str.normalize('NFD') + >>> ser = pd.Series(["ñ"]) + >>> ser.str.normalize("NFC") == ser.str.normalize("NFD") 0 False dtype: bool """ result = self._data.array._str_normalize(form) return self._wrap_result(result) - _shared_docs[ - "index" - ] = """ + _shared_docs["index"] = """ Return %(side)s indexes in each string in Series/Index. Each of the returned indexes corresponds to the position where the @@ -3069,12 +3083,16 @@ def len(self): Returns the length (number of characters) in a string. Returns the number of entries for dictionaries, lists or tuples. - >>> s = pd.Series(['dog', - ... '', - ... 5, - ... {'foo' : 'bar'}, - ... [2, 3, 5, 7], - ... ('one', 'two', 'three')]) + >>> s = pd.Series( + ... [ + ... "dog", + ... "", + ... 5, + ... {"foo": "bar"}, + ... [2, 3, 5, 7], + ... ("one", "two", "three"), + ... ] + ... ) >>> s 0 dog 1 @@ -3095,9 +3113,7 @@ def len(self): result = self._data.array._str_len() return self._wrap_result(result, returns_string=False) - _shared_docs[ - "casemethods" - ] = """ + _shared_docs["casemethods"] = """ Convert strings in the Series/Index to %(type)s. %(version)s Equivalent to :meth:`str.%(method)s`. @@ -3225,9 +3241,7 @@ def casefold(self): result = self._data.array._str_casefold() return self._wrap_result(result) - _shared_docs[ - "ismethods" - ] = """ + _shared_docs["ismethods"] = """ Check whether all characters in each string are %(type)s. This is equivalent to running the Python string method diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 05262c235568d..a85e550397695 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -897,9 +897,9 @@ def to_datetime( can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same - >>> df = pd.DataFrame({'year': [2015, 2016], - ... 'month': [2, 3], - ... 'day': [4, 5]}) + >>> df = pd.DataFrame( + ... {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]} + ... 
) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 @@ -907,9 +907,9 @@ def to_datetime( Using a unix epoch time - >>> pd.to_datetime(1490195805, unit='s') + >>> pd.to_datetime(1490195805, unit="s") Timestamp('2017-03-22 15:16:45') - >>> pd.to_datetime(1490195805433502912, unit='ns') + >>> pd.to_datetime(1490195805433502912, unit="ns") Timestamp('2017-03-22 15:16:45.433502912') .. warning:: For float arg, precision rounding might happen. To prevent @@ -917,8 +917,9 @@ def to_datetime( Using a non-unix epoch origin - >>> pd.to_datetime([1, 2, 3], unit='D', - ... origin=pd.Timestamp('1960-01-01')) + >>> pd.to_datetime( + ... [1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01") + ... ) DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None) @@ -926,8 +927,10 @@ def to_datetime( :const:`"%f"` will parse all the way up to nanoseconds. - >>> pd.to_datetime('2018-10-26 12:00:00.0000000011', - ... format='%Y-%m-%d %H:%M:%S.%f') + >>> pd.to_datetime( + ... "2018-10-26 12:00:00.0000000011", + ... format="%Y-%m-%d %H:%M:%S.%f", + ... ) Timestamp('2018-10-26 12:00:00.000000001') **Non-convertible date/times** @@ -935,7 +938,7 @@ def to_datetime( Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. - >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') + >>> pd.to_datetime("13000101", format="%Y%m%d", errors="coerce") NaT .. _to_datetime_tz_examples: @@ -946,14 +949,18 @@ def to_datetime( - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: - >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) + >>> pd.to_datetime( + ... ["2018-10-26 12:00:00", "2018-10-26 13:00:15"] + ... ) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) - Timezone-aware inputs *with constant time offset* are converted to timezone-aware :class:`DatetimeIndex`: - >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) + >>> pd.to_datetime( + ... ["2018-10-26 12:00 -0500", "2018-10-26 13:00 -0500"] + ... ) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], dtype='datetime64[ns, UTC-05:00]', freq=None) @@ -965,8 +972,9 @@ def to_datetime( and a simple :class:`Index` containing :class:`datetime.datetime` objects will be returned: - >>> pd.to_datetime(['2020-10-25 02:00 +0200', - ... '2020-10-25 04:00 +0100']) # doctest: +SKIP + >>> pd.to_datetime( + ... ["2020-10-25 02:00 +0200", "2020-10-25 04:00 +0100"] + ... ) # doctest: +SKIP FutureWarning: In a future version of pandas, parsing datetimes with mixed time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` @@ -979,8 +987,12 @@ def to_datetime( a simple :class:`Index` containing :class:`datetime.datetime` objects: >>> from datetime import datetime - >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", - ... datetime(2020, 1, 1, 3, 0)]) # doctest: +SKIP + >>> pd.to_datetime( + ... [ + ... "2020-01-01 01:00:00-01:00", + ... datetime(2020, 1, 1, 3, 0), + ... ] + ... ) # doctest: +SKIP FutureWarning: In a future version of pandas, parsing datetimes with mixed time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. 
To create a `Series` @@ -994,22 +1006,28 @@ def to_datetime( - Timezone-naive inputs are *localized* as UTC - >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) + >>> pd.to_datetime( + ... ["2018-10-26 12:00", "2018-10-26 13:00"], utc=True + ... ) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) - Timezone-aware inputs are *converted* to UTC (the output represents the exact same datetime, but viewed from the UTC time offset `+00:00`). - >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], - ... utc=True) + >>> pd.to_datetime( + ... ["2018-10-26 12:00 -0530", "2018-10-26 12:00 -0500"], + ... utc=True, + ... ) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) - Inputs can contain both string or datetime, the above rules still apply - >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True) + >>> pd.to_datetime( + ... ["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True + ... ) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) """ diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 09652a7d8bc92..2ae57d3c8508e 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -124,24 +124,24 @@ def to_numeric( -------- Take separate series and convert to numeric, coercing when told to - >>> s = pd.Series(['1.0', '2', -3]) + >>> s = pd.Series(["1.0", "2", -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 - >>> pd.to_numeric(s, downcast='float') + >>> pd.to_numeric(s, downcast="float") 0 1.0 1 2.0 2 -3.0 dtype: float32 - >>> pd.to_numeric(s, downcast='signed') + >>> pd.to_numeric(s, downcast="signed") 0 1 1 2 2 -3 dtype: int8 - >>> s = pd.Series(['apple', '1.0', '2', -3]) - >>> pd.to_numeric(s, errors='coerce') + >>> s = pd.Series(["apple", "1.0", "2", -3]) + >>> pd.to_numeric(s, errors="coerce") 0 NaN 1 1.0 2 2.0 diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index d772c908c4731..7e0331cc26904 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -158,24 +158,24 @@ def to_timedelta( -------- Parsing a single string to a Timedelta: - >>> pd.to_timedelta('1 days 06:05:01.00003') + >>> pd.to_timedelta("1 days 06:05:01.00003") Timedelta('1 days 06:05:01.000030') - >>> pd.to_timedelta('15.5us') + >>> pd.to_timedelta("15.5us") Timedelta('0 days 00:00:00.000015500') Parsing a list or array of strings: - >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + >>> pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"]) TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], dtype='timedelta64[ns]', freq=None) Converting numbers by specifying the `unit` keyword argument: - >>> pd.to_timedelta(np.arange(5), unit='s') + >>> pd.to_timedelta(np.arange(5), unit="s") TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', '0 days 00:00:04'], dtype='timedelta64[ns]', freq=None) - >>> pd.to_timedelta(np.arange(5), unit='d') + >>> pd.to_timedelta(np.arange(5), unit="d") TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e78bd258c11ff..94177b0945ac9 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -993,7 +993,7 
@@ class Window(BaseWindow): Examples -------- - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) >>> df B 0 0.0 @@ -1016,12 +1016,16 @@ class Window(BaseWindow): Rolling sum with a window span of 2 seconds. - >>> df_time = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - ... index=[pd.Timestamp('20130101 09:00:00'), - ... pd.Timestamp('20130101 09:00:02'), - ... pd.Timestamp('20130101 09:00:03'), - ... pd.Timestamp('20130101 09:00:05'), - ... pd.Timestamp('20130101 09:00:06')]) + >>> df_time = pd.DataFrame( + ... {"B": [0, 1, 2, np.nan, 4]}, + ... index=[ + ... pd.Timestamp("20130101 09:00:00"), + ... pd.Timestamp("20130101 09:00:02"), + ... pd.Timestamp("20130101 09:00:03"), + ... pd.Timestamp("20130101 09:00:05"), + ... pd.Timestamp("20130101 09:00:06"), + ... ], + ... ) >>> df_time B @@ -1031,7 +1035,7 @@ class Window(BaseWindow): 2013-01-01 09:00:05 NaN 2013-01-01 09:00:06 4.0 - >>> df_time.rolling('2s').sum() + >>> df_time.rolling("2s").sum() B 2013-01-01 09:00:00 0.0 2013-01-01 09:00:02 1.0 @@ -1041,7 +1045,9 @@ class Window(BaseWindow): Rolling sum with forward looking windows with 2 observations. - >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2) + >>> indexer = pd.api.indexers.FixedForwardWindowIndexer( + ... window_size=2 + ... ) >>> df.rolling(window=indexer, min_periods=1).sum() B 0 1.0 @@ -1099,7 +1105,7 @@ class Window(BaseWindow): Rolling sum with a window length of 2, using the Scipy ``'gaussian'`` window type. ``std`` is required in the aggregation function. - >>> df.rolling(2, win_type='gaussian').sum(std=3) + >>> df.rolling(2, win_type="gaussian").sum(std=3) B 0 NaN 1 0.986207 @@ -1111,12 +1117,17 @@ class Window(BaseWindow): Rolling sum with a window length of 2 days. - >>> df = pd.DataFrame({ - ... 'A': [pd.to_datetime('2020-01-01'), - ... pd.to_datetime('2020-01-01'), - ... pd.to_datetime('2020-01-02'),], - ... 'B': [1, 2, 3], }, - ... index=pd.date_range('2020', periods=3)) + >>> df = pd.DataFrame( + ... { + ... "A": [ + ... pd.to_datetime("2020-01-01"), + ... pd.to_datetime("2020-01-01"), + ... pd.to_datetime("2020-01-02"), + ... ], + ... "B": [1, 2, 3], + ... }, + ... index=pd.date_range("2020", periods=3), + ... ) >>> df A B @@ -1124,7 +1135,7 @@ class Window(BaseWindow): 2020-01-02 2020-01-01 2 2020-01-03 2020-01-02 3 - >>> df.rolling('2D', on='A').sum() + >>> df.rolling("2D", on="A").sum() A B 2020-01-01 2020-01-01 1.0 2020-01-02 2020-01-01 3.0 diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 01094ba36b9dd..058ddb37199ff 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -36,7 +36,9 @@ class NullFrequencyError(ValueError): Examples -------- - >>> df = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) + >>> df = pd.DatetimeIndex( + ... ["2011-01-01 10:00", "2011-01-01"], freq=None + ... ) >>> df.shift(2) Traceback (most recent call last): NullFrequencyError: Cannot shift with no freq @@ -49,9 +51,13 @@ class PerformanceWarning(Warning): Examples -------- - >>> df = pd.DataFrame({"jim": [0, 0, 1, 1], - ... "joe": ["x", "x", "z", "y"], - ... "jolie": [1, 2, 3, 4]}) + >>> df = pd.DataFrame( + ... { + ... "jim": [0, 0, 1, 1], + ... "joe": ["x", "x", "z", "y"], + ... "jolie": [1, 2, 3, 4], + ... } + ... 
) >>> df = df.set_index(["jim", "joe"]) >>> df jolie @@ -60,7 +66,7 @@ class PerformanceWarning(Warning): x 2 1 z 3 y 4 - >>> df.loc[(1, 'z')] # doctest: +SKIP + >>> df.loc[(1, "z")] # doctest: +SKIP # PerformanceWarning: indexing past lexsort depth may impact performance. df.loc[(1, 'z')] jolie @@ -77,10 +83,13 @@ class UnsupportedFunctionCall(ValueError): Examples -------- - >>> df = pd.DataFrame({"A": [0, 0, 1, 1], - ... "B": ["x", "x", "z", "y"], - ... "C": [1, 2, 3, 4]} - ... ) + >>> df = pd.DataFrame( + ... { + ... "A": [0, 0, 1, 1], + ... "B": ["x", "x", "z", "y"], + ... "C": [1, 2, 3, 4], + ... } + ... ) >>> np.cumsum(df.groupby(["A"])) Traceback (most recent call last): UnsupportedFunctionCall: numpy operations are not valid with groupby. @@ -96,10 +105,13 @@ class UnsortedIndexError(KeyError): Examples -------- - >>> df = pd.DataFrame({"cat": [0, 0, 1, 1], - ... "color": ["white", "white", "brown", "black"], - ... "lives": [4, 4, 3, 7]}, - ... ) + >>> df = pd.DataFrame( + ... { + ... "cat": [0, 0, 1, 1], + ... "color": ["white", "white", "brown", "black"], + ... "lives": [4, 4, 3, 7], + ... }, + ... ) >>> df = df.set_index(["cat", "color"]) >>> df lives @@ -108,7 +120,7 @@ class UnsortedIndexError(KeyError): white 4 1 brown 3 black 7 - >>> df.loc[(0, "black"):(1, "white")] + >>> df.loc[(0, "black") : (1, "white")] Traceback (most recent call last): UnsortedIndexError: 'Key length (2) was greater than MultiIndex lexsort depth (1)' @@ -133,7 +145,7 @@ class ParserError(ValueError): ... cat,foo,bar ... dog,foo,"baz''' >>> from io import StringIO - >>> pd.read_csv(StringIO(data), skipfooter=1, engine='python') + >>> pd.read_csv(StringIO(data), skipfooter=1, engine="python") Traceback (most recent call last): ParserError: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows @@ -167,11 +179,16 @@ class DtypeWarning(Warning): This example creates and reads a large CSV file with a column that contains `int` and `str`. - >>> df = pd.DataFrame({'a': (['1'] * 100000 + ['X'] * 100000 + - ... ['1'] * 100000), - ... 'b': ['b'] * 300000}) # doctest: +SKIP - >>> df.to_csv('test.csv', index=False) # doctest: +SKIP - >>> df2 = pd.read_csv('test.csv') # doctest: +SKIP + >>> df = pd.DataFrame( + ... { + ... "a": ( + ... ["1"] * 100000 + ["X"] * 100000 + ["1"] * 100000 + ... ), + ... "b": ["b"] * 300000, + ... } + ... ) # doctest: +SKIP + >>> df.to_csv("test.csv", index=False) # doctest: +SKIP + >>> df2 = pd.read_csv("test.csv") # doctest: +SKIP ... # DtypeWarning: Columns (0) have mixed types Important to notice that ``df2`` will contain both `str` and `int` for the @@ -189,7 +206,9 @@ class DtypeWarning(Warning): One way to solve this issue is using the `dtype` parameter in the `read_csv` and `read_table` functions to explicit the conversion: - >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str}) # doctest: +SKIP + >>> df2 = pd.read_csv( + ... "test.csv", sep=",", dtype={"a": str} + ... ) # doctest: +SKIP No warning was issued. """ @@ -241,12 +260,16 @@ class ParserWarning(Warning): >>> csv = '''a;b;c ... 1;1,8 ... 1;2,1''' - >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]') # doctest: +SKIP + >>> df = pd.read_csv( + ... io.StringIO(csv), sep="[;,]" + ... ) # doctest: +SKIP ... # ParserWarning: Falling back to the 'python' engine... Adding `engine='python'` to `pd.read_csv` removes the Warning: - >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]', engine='python') + >>> df = pd.read_csv( + ... io.StringIO(csv), sep="[;,]", engine="python" + ... 
) """ @@ -258,13 +281,25 @@ class MergeError(ValueError): Examples -------- - >>> left = pd.DataFrame({"a": ["a", "b", "b", "d"], - ... "b": ["cat", "dog", "weasel", "horse"]}, - ... index=range(4)) - >>> right = pd.DataFrame({"a": ["a", "b", "c", "d"], - ... "c": ["meow", "bark", "chirp", "nay"]}, - ... index=range(4)).set_index("a") - >>> left.join(right, on="a", validate="one_to_one",) + >>> left = pd.DataFrame( + ... { + ... "a": ["a", "b", "b", "d"], + ... "b": ["cat", "dog", "weasel", "horse"], + ... }, + ... index=range(4), + ... ) + >>> right = pd.DataFrame( + ... { + ... "a": ["a", "b", "c", "d"], + ... "c": ["meow", "bark", "chirp", "nay"], + ... }, + ... index=range(4), + ... ).set_index("a") + >>> left.join( + ... right, + ... on="a", + ... validate="one_to_one", + ... ) Traceback (most recent call last): MergeError: Merge keys are not unique in left dataset; not a one-to-one merge """ @@ -279,7 +314,10 @@ class AbstractMethodError(NotImplementedError): >>> class Foo: ... @classmethod ... def classmethod(cls): - ... raise pd.errors.AbstractMethodError(cls, methodtype="classmethod") + ... raise pd.errors.AbstractMethodError( + ... cls, methodtype="classmethod" + ... ) + ... ... def method(self): ... raise pd.errors.AbstractMethodError(self) >>> test = Foo.classmethod() @@ -314,8 +352,10 @@ class NumbaUtilError(Exception): Examples -------- - >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "data": [1, 2, 3, 4]}, - ... columns=["key", "data"]) + >>> df = pd.DataFrame( + ... {"key": ["a", "a", "b", "b"], "data": [1, 2, 3, 4]}, + ... columns=["key", "data"], + ... ) >>> def incorrect_function(x): ... return sum(x) * 2.7 >>> df.groupby("key").agg(incorrect_function, engine="numba") @@ -331,10 +371,10 @@ class DuplicateLabelError(ValueError): Examples -------- - >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags( + >>> s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags( ... allows_duplicate_labels=False ... ) - >>> s.reindex(['a', 'a', 'b']) + >>> s.reindex(["a", "a", "b"]) Traceback (most recent call last): ... DuplicateLabelError: Index has duplicates. @@ -351,8 +391,7 @@ class InvalidIndexError(Exception): Examples -------- >>> idx = pd.MultiIndex.from_product([["x", "y"], [0, 1]]) - >>> df = pd.DataFrame([[1, 1, 2, 2], - ... [3, 3, 4, 4]], columns=idx) + >>> df = pd.DataFrame([[1, 1, 2, 2], [3, 3, 4, 4]], columns=idx) >>> df x y 0 1 0 1 @@ -373,7 +412,7 @@ class DataError(Exception): Examples -------- - >>> ser = pd.Series(['a', 'b', 'c']) + >>> ser = pd.Series(["a", "b", "c"]) >>> ser.rolling(2).sum() Traceback (most recent call last): DataError: No numeric types to aggregate @@ -394,16 +433,18 @@ class SpecificationError(Exception): Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2], - ... 'B': range(5), - ... 'C': range(5)}) - >>> df.groupby('A').B.agg({'foo': 'count'}) # doctest: +SKIP + >>> df = pd.DataFrame( + ... {"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)} + ... ) + >>> df.groupby("A").B.agg({"foo": "count"}) # doctest: +SKIP ... # SpecificationError: nested renamer is not supported - >>> df.groupby('A').agg({'B': {'foo': ['sum', 'max']}}) # doctest: +SKIP + >>> df.groupby("A").agg( + ... {"B": {"foo": ["sum", "max"]}} + ... ) # doctest: +SKIP ... # SpecificationError: nested renamer is not supported - >>> df.groupby('A').agg(['min', 'min']) # doctest: +SKIP + >>> df.groupby("A").agg(["min", "min"]) # doctest: +SKIP ... 
# SpecificationError: nested renamer is not supported """ @@ -423,9 +464,9 @@ class SettingWithCopyError(ValueError): Examples -------- - >>> pd.options.mode.chained_assignment = 'raise' - >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A']) - >>> df.loc[0:3]['A'] = 'a' # doctest: +SKIP + >>> pd.options.mode.chained_assignment = "raise" + >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2]}, columns=["A"]) + >>> df.loc[0:3]["A"] = "a" # doctest: +SKIP ... # SettingWithCopyError: A value is trying to be set on a copy of a... """ @@ -446,8 +487,8 @@ class SettingWithCopyWarning(Warning): Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A']) - >>> df.loc[0:3]['A'] = 'a' # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2]}, columns=["A"]) + >>> df.loc[0:3]["A"] = "a" # doctest: +SKIP ... # SettingWithCopyWarning: A value is trying to be set on a copy of a... """ @@ -468,8 +509,8 @@ class ChainedAssignmentError(Warning): Examples -------- >>> pd.options.mode.copy_on_write = True - >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A']) - >>> df["A"][0:3] = 10 # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2]}, columns=["A"]) + >>> df["A"][0:3] = 10 # doctest: +SKIP ... # ChainedAssignmentError: ... >>> pd.options.mode.copy_on_write = False """ @@ -560,11 +601,11 @@ class NumExprClobberingError(NameError): Examples -------- - >>> df = pd.DataFrame({'abs': [1, 1, 1]}) - >>> df.query("abs > 2") # doctest: +SKIP + >>> df = pd.DataFrame({"abs": [1, 1, 1]}) + >>> df.query("abs > 2") # doctest: +SKIP ... # NumExprClobberingError: Variables in expression "(abs) > (2)" overlap... >>> sin, a = 1, 2 - >>> pd.eval("sin + a", engine='numexpr') # doctest: +SKIP + >>> pd.eval("sin + a", engine="numexpr") # doctest: +SKIP ... # NumExprClobberingError: Variables in expression "(sin) + (a)" overlap... """ @@ -577,12 +618,12 @@ class UndefinedVariableError(NameError): Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.query("A > x") # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1]}) + >>> df.query("A > x") # doctest: +SKIP ... # UndefinedVariableError: name 'x' is not defined - >>> df.query("A > @y") # doctest: +SKIP + >>> df.query("A > @y") # doctest: +SKIP ... # UndefinedVariableError: local variable 'y' is not defined - >>> pd.eval('x + 1') # doctest: +SKIP + >>> pd.eval("x + 1") # doctest: +SKIP ... # UndefinedVariableError: name 'x' is not defined """ @@ -601,17 +642,19 @@ class IndexingError(Exception): Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.loc[..., ..., 'A'] # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1]}) + >>> df.loc[..., ..., "A"] # doctest: +SKIP ... # IndexingError: indexer may only contain one '...' entry - >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.loc[1, ..., ...] # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1]}) + >>> df.loc[1, ..., ...] # doctest: +SKIP ... # IndexingError: Too many indexers - >>> df[pd.Series([True], dtype=bool)] # doctest: +SKIP + >>> df[pd.Series([True], dtype=bool)] # doctest: +SKIP ... # IndexingError: Unalignable boolean Series provided as indexer... - >>> s = pd.Series(range(2), - ... index = pd.MultiIndex.from_product([["a", "b"], ["c"]])) - >>> s.loc["a", "c", "d"] # doctest: +SKIP + >>> s = pd.Series( + ... range(2), + ... index=pd.MultiIndex.from_product([["a", "b"], ["c"]]), + ... ) + >>> s.loc["a", "c", "d"] # doctest: +SKIP ... 
# IndexingError: Too many indexers """ @@ -647,14 +690,14 @@ class CSSWarning(UserWarning): Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1]}) + >>> df = pd.DataFrame({"A": [1, 1, 1]}) >>> df.style.applymap( - ... lambda x: 'background-color: blueGreenRed;' - ... ).to_excel('styled.xlsx') # doctest: +SKIP + ... lambda x: "background-color: blueGreenRed;" + ... ).to_excel("styled.xlsx") # doctest: +SKIP CSSWarning: Unhandled color format: 'blueGreenRed' >>> df.style.applymap( - ... lambda x: 'border: 1px solid red red;' - ... ).to_excel('styled.xlsx') # doctest: +SKIP + ... lambda x: "border: 1px solid red red;" + ... ).to_excel("styled.xlsx") # doctest: +SKIP CSSWarning: Unhandled color format: 'blueGreenRed' """ @@ -665,8 +708,8 @@ class PossibleDataLossError(Exception): Examples -------- - >>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP - >>> store.open("w") # doctest: +SKIP + >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP + >>> store.open("w") # doctest: +SKIP ... # PossibleDataLossError: Re-opening the file [my-store] with mode [a]... """ @@ -677,9 +720,9 @@ class ClosedFileError(Exception): Examples -------- - >>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP - >>> store.close() # doctest: +SKIP - >>> store.keys() # doctest: +SKIP + >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP + >>> store.close() # doctest: +SKIP + >>> store.keys() # doctest: +SKIP ... # ClosedFileError: my-store file is not open! """ @@ -700,12 +743,16 @@ class AttributeConflictWarning(Warning): Examples -------- - >>> idx1 = pd.Index(['a', 'b'], name='name1') + >>> idx1 = pd.Index(["a", "b"], name="name1") >>> df1 = pd.DataFrame([[1, 2], [3, 4]], index=idx1) - >>> df1.to_hdf('file', 'data', 'w', append=True) # doctest: +SKIP - >>> idx2 = pd.Index(['c', 'd'], name='name2') + >>> df1.to_hdf( + ... "file", "data", "w", append=True + ... ) # doctest: +SKIP + >>> idx2 = pd.Index(["c", "d"], name="name2") >>> df2 = pd.DataFrame([[5, 6], [7, 8]], index=idx2) - >>> df2.to_hdf('file', 'data', 'a', append=True) # doctest: +SKIP + >>> df2.to_hdf( + ... "file", "data", "a", append=True + ... ) # doctest: +SKIP AttributeConflictWarning: the [index_name] attribute of the existing index is [name1] which conflicts with the new [name2]... """ @@ -718,8 +765,8 @@ class DatabaseError(OSError): Examples -------- >>> from sqlite3 import connect - >>> conn = connect(':memory:') - >>> pd.read_sql('select * test', conn) # doctest: +SKIP + >>> conn = connect(":memory:") + >>> pd.read_sql("select * test", conn) # doctest: +SKIP ... # DatabaseError: Execution failed on sql 'test': near "test": syntax error """ @@ -733,8 +780,10 @@ class PossiblePrecisionLoss(Warning): Examples -------- - >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)}) - >>> df.to_stata('test') # doctest: +SKIP + >>> df = pd.DataFrame( + ... {"s": pd.Series([1, 2**53], dtype=np.int64)} + ... ) + >>> df.to_stata("test") # doctest: +SKIP ... # PossiblePrecisionLoss: Column converted from int64 to float64... """ @@ -745,8 +794,10 @@ class ValueLabelTypeMismatch(Warning): Examples -------- - >>> df = pd.DataFrame({"categories": pd.Series(["a", 2], dtype="category")}) - >>> df.to_stata('test') # doctest: +SKIP + >>> df = pd.DataFrame( + ... {"categories": pd.Series(["a", 2], dtype="category")} + ... ) + >>> df.to_stata("test") # doctest: +SKIP ... # ValueLabelTypeMismatch: Stata value labels (pandas categories) must be str... 
""" @@ -761,7 +812,7 @@ class InvalidColumnName(Warning): Examples -------- >>> df = pd.DataFrame({"0categories": pd.Series([2, 2])}) - >>> df.to_stata('test') # doctest: +SKIP + >>> df.to_stata("test") # doctest: +SKIP ... # InvalidColumnName: Not all pandas column names were valid Stata variable... """ @@ -773,9 +824,11 @@ class CategoricalConversionWarning(Warning): Examples -------- >>> from pandas.io.stata import StataReader - >>> with StataReader('dta_file', chunksize=2) as reader: # doctest: +SKIP - ... for i, block in enumerate(reader): - ... print(i, block) + >>> with StataReader( + ... "dta_file", chunksize=2 + ... ) as reader: # doctest: +SKIP + ... for i, block in enumerate(reader): + ... print(i, block) ... # CategoricalConversionWarning: One or more series with value labels... """ diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index a15e37328e9fa..2115198b60c8d 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -64,7 +64,9 @@ def read_clipboard( Examples -------- - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) + >>> df = pd.DataFrame( + ... [[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"] + ... ) >>> df.to_clipboard() # doctest: +SKIP >>> pd.read_clipboard() # doctest: +SKIP A B C diff --git a/pandas/io/common.py b/pandas/io/common.py index 72c9deeb54fc7..f70f631f5db39 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -792,7 +792,9 @@ def get_handle( # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]], # ReadBuffer[bytes], WriteBuffer[bytes]]" handle = _BytesZipFile( - handle, ioargs.mode, **compression_args # type: ignore[arg-type] + handle, + ioargs.mode, + **compression_args, # type: ignore[arg-type] ) if handle.buffer.mode == "r": handles.append(handle) @@ -817,7 +819,8 @@ def get_handle( # type "BaseBuffer"; expected "Union[ReadBuffer[bytes], # WriteBuffer[bytes], None]" handle = _BytesTarFile( - fileobj=handle, **compression_args # type: ignore[arg-type] + fileobj=handle, + **compression_args, # type: ignore[arg-type] ) assert isinstance(handle, _BytesTarFile) if "r" in handle.buffer.mode: @@ -841,7 +844,9 @@ def get_handle( # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], # PathLike[bytes]], IO[bytes]], None]" handle = get_lzma_file()( - handle, ioargs.mode, **compression_args # type: ignore[arg-type] + handle, + ioargs.mode, + **compression_args, # type: ignore[arg-type] ) # Zstd Compression @@ -1137,7 +1142,9 @@ def _maybe_memory_map( # expected "BaseBuffer" wrapped = _IOWrapper( mmap.mmap( - handle.fileno(), 0, access=mmap.ACCESS_READ # type: ignore[arg-type] + handle.fileno(), + 0, + access=mmap.ACCESS_READ, # type: ignore[arg-type] ) ) finally: @@ -1241,7 +1248,9 @@ def dedup_names( Examples -------- - >>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False) + >>> dedup_names( + ... ["x", "y", "x", "x"], is_potential_multiindex=False + ... ) ['x', 'y', 'x.1', 'x.2'] """ names = list(names) # so we can index diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bce890c6f73b0..9c1de153f58ba 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1005,17 +1005,27 @@ class ExcelWriter(Generic[_WorkbookT]): -------- Default usage: - >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP + >>> df = pd.DataFrame( + ... [["ABC", "XYZ"]], columns=["Foo", "Bar"] + ... ) # doctest: +SKIP >>> with pd.ExcelWriter("path_to_file.xlsx") as writer: ... 
df.to_excel(writer) # doctest: +SKIP To write to separate sheets in a single file: - >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) # doctest: +SKIP - >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP + >>> df1 = pd.DataFrame( + ... [["AAA", "BBB"]], columns=["Spam", "Egg"] + ... ) # doctest: +SKIP + >>> df2 = pd.DataFrame( + ... [["ABC", "XYZ"]], columns=["Foo", "Bar"] + ... ) # doctest: +SKIP >>> with pd.ExcelWriter("path_to_file.xlsx") as writer: - ... df1.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP - ... df2.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP + ... df1.to_excel( + ... writer, sheet_name="Sheet1" + ... ) # doctest: +SKIP + ... df2.to_excel( + ... writer, sheet_name="Sheet2" + ... ) # doctest: +SKIP You can set the date format or datetime format: @@ -1023,7 +1033,10 @@ class ExcelWriter(Generic[_WorkbookT]): >>> df = pd.DataFrame( ... [ ... [date(2014, 1, 31), date(1999, 9, 24)], - ... [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ... [ + ... datetime(1998, 5, 26, 23, 33, 4), + ... datetime(2014, 2, 28, 13, 5, 13), + ... ], ... ], ... index=["Date", "Datetime"], ... columns=["X", "Y"], @@ -1031,14 +1044,18 @@ class ExcelWriter(Generic[_WorkbookT]): >>> with pd.ExcelWriter( ... "path_to_file.xlsx", ... date_format="YYYY-MM-DD", - ... datetime_format="YYYY-MM-DD HH:MM:SS" + ... datetime_format="YYYY-MM-DD HH:MM:SS", ... ) as writer: ... df.to_excel(writer) # doctest: +SKIP You can also append to an existing Excel file: - >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer: - ... df.to_excel(writer, sheet_name="Sheet3") # doctest: +SKIP + >>> with pd.ExcelWriter( + ... "path_to_file.xlsx", mode="a", engine="openpyxl" + ... ) as writer: + ... df.to_excel( + ... writer, sheet_name="Sheet3" + ... ) # doctest: +SKIP Here, the `if_sheet_exists` parameter can be set to replace a sheet if it already exists: @@ -1049,18 +1066,23 @@ class ExcelWriter(Generic[_WorkbookT]): ... engine="openpyxl", ... if_sheet_exists="replace", ... ) as writer: - ... df.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP + ... df.to_excel( + ... writer, sheet_name="Sheet1" + ... ) # doctest: +SKIP You can also write multiple DataFrames to a single sheet. Note that the ``if_sheet_exists`` parameter needs to be set to ``overlay``: - >>> with ExcelWriter("path_to_file.xlsx", + >>> with ExcelWriter( + ... "path_to_file.xlsx", ... mode="a", ... engine="openpyxl", ... if_sheet_exists="overlay", ... ) as writer: ... df1.to_excel(writer, sheet_name="Sheet1") - ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) # doctest: +SKIP + ... df2.to_excel( + ... writer, sheet_name="Sheet1", startcol=3 + ... ) # doctest: +SKIP You can store Excel file in RAM: @@ -1073,7 +1095,9 @@ class ExcelWriter(Generic[_WorkbookT]): You can pack Excel file into zip archive: >>> import zipfile # doctest: +SKIP - >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP + >>> df = pd.DataFrame( + ... [["ABC", "XYZ"]], columns=["Foo", "Bar"] + ... ) # doctest: +SKIP >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf: ... with zf.open("filename.xlsx", "w") as buffer: ... with pd.ExcelWriter(buffer) as writer: @@ -1084,7 +1108,9 @@ class ExcelWriter(Generic[_WorkbookT]): >>> with pd.ExcelWriter( ... "path_to_file.xlsx", ... engine="xlsxwriter", - ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}} + ... engine_kwargs={ + ... 
{"options": {{"nan_inf_to_errors": True}}} + ... }, ... ) as writer: ... df.to_excel(writer) # doctest: +SKIP @@ -1095,9 +1121,11 @@ class ExcelWriter(Generic[_WorkbookT]): ... "path_to_file.xlsx", ... engine="openpyxl", ... mode="a", - ... engine_kwargs={{"keep_vba": True}} + ... engine_kwargs={{"keep_vba": True}}, ... ) as writer: - ... df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP + ... df.to_excel( + ... writer, sheet_name="Sheet2" + ... ) # doctest: +SKIP """ # Defining an ExcelWriter implementation (see abstract methods for more...) @@ -1497,7 +1525,7 @@ class ExcelFile: Examples -------- - >>> file = pd.ExcelFile('myfile.xlsx') # doctest: +SKIP + >>> file = pd.ExcelFile("myfile.xlsx") # doctest: +SKIP >>> with pd.ExcelFile("myfile.xls") as xls: # doctest: +SKIP ... df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP """ @@ -1620,9 +1648,11 @@ def parse( Examples -------- - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) - >>> df.to_excel('myfile.xlsx') # doctest: +SKIP - >>> file = pd.ExcelFile('myfile.xlsx') # doctest: +SKIP + >>> df = pd.DataFrame( + ... [[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"] + ... ) + >>> df.to_excel("myfile.xlsx") # doctest: +SKIP + >>> file = pd.ExcelFile("myfile.xlsx") # doctest: +SKIP >>> file.parse() # doctest: +SKIP """ return self._reader.parse( diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index 1b30d0f759634..1458825f60349 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -75,7 +75,8 @@ def load_workbook( from python_calamine import load_workbook return load_workbook( - filepath_or_buffer, **engine_kwargs # type: ignore[arg-type] + filepath_or_buffer, + **engine_kwargs, # type: ignore[arg-type] ) @property diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index f7a1fcb8052e3..95d43f60a22c5 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -143,9 +143,9 @@ def _range2cols(areas: str) -> list[int]: Examples -------- - >>> _range2cols('A:E') + >>> _range2cols("A:E") [0, 1, 2, 3, 4] - >>> _range2cols('A,C,Z:AB') + >>> _range2cols("A,C,Z:AB") [0, 2, 25, 26, 27] """ cols: list[int] = [] diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index d0aaf83b84cb2..9a67b300eb3ed 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -107,7 +107,9 @@ def read_feather( Examples -------- - >>> df = pd.read_feather("path/to/file.feather") # doctest: +SKIP + >>> df = pd.read_feather( + ... "path/to/file.feather" + ... ) # doctest: +SKIP """ import_optional_dependency("pyarrow") from pyarrow import feather diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index ccce60c00a9e0..c3baf115eea56 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -244,14 +244,17 @@ def __call__( Examples -------- >>> resolve = CSSResolver() - >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'} - >>> out = resolve(''' + >>> inherited = {"font-family": "serif", "font-weight": "bold"} + >>> out = resolve( + ... ''' ... border-color: BLUE RED; ... font-size: 1em; ... font-size: 2em; ... font-weight: normal; ... font-weight: inherit; - ... ''', inherited) + ... ''', + ... inherited, + ... 
) >>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE [('border-bottom-color', 'blue'), ('border-left-color', 'red'), diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 445d60be1dc15..7fc632e798c53 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -361,7 +361,9 @@ def get_dataframe_repr_params() -> dict[str, Any]: >>> import pandas as pd >>> >>> df = pd.DataFrame([[1, 2], [3, 4]]) - >>> repr_params = pd.io.formats.format.get_dataframe_repr_params() + >>> repr_params = ( + ... pd.io.formats.format.get_dataframe_repr_params() + ... ) >>> repr(df) == df.to_string(**repr_params) True """ @@ -1568,7 +1570,9 @@ def format_percentiles( -------- Keeps all entries different after rounding: - >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + >>> format_percentiles( + ... [0.01999, 0.02001, 0.5, 0.666666, 0.9999] + ... ) ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] No element is rounded to 0% or 100% (unless already equal to it). diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 552affbd053f2..23aa346c75f55 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -334,10 +334,10 @@ def _sizeof_fmt(num: float, size_qualifier: str) -> str: Examples -------- - >>> _sizeof_fmt(23028, '') + >>> _sizeof_fmt(23028, "") '22.5 KB' - >>> _sizeof_fmt(23028, '+') + >>> _sizeof_fmt(23028, "+") '22.5+ KB' """ for x in ["bytes", "KB", "MB", "GB", "TB"]: diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 2cc9368f8846a..45465eb51c975 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -474,7 +474,7 @@ def _justify( Examples -------- - >>> _justify([['a', 'b']], [['abc', 'abcd']]) + >>> _justify([["a", "b"]], [["abc", "abcd"]]) ([(' a', ' b')], [('abc', 'abcd')]) """ combined = head + tail diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b62f7581ac220..d0e6f8b7ff9a9 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -241,10 +241,14 @@ class Styler(StylerRenderer): Examples -------- - >>> df = pd.DataFrame([[1.0, 2.0, 3.0], [4, 5, 6]], index=['a', 'b'], - ... columns=['A', 'B', 'C']) - >>> pd.io.formats.style.Styler(df, precision=2, - ... caption="My table") # doctest: +SKIP + >>> df = pd.DataFrame( + ... [[1.0, 2.0, 3.0], [4, 5, 6]], + ... index=["a", "b"], + ... columns=["A", "B", "C"], + ... ) + >>> pd.io.formats.style.Styler( + ... df, precision=2, caption="My table" + ... ) # doctest: +SKIP Please see: `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. @@ -351,10 +355,14 @@ def concat(self, other: Styler) -> Styler: A common use case is adding totals rows, or otherwise, via methods calculated in ``DataFrame.agg``. - >>> df = pd.DataFrame([[4, 6], [1, 9], [3, 4], [5, 5], [9, 6]], - ... columns=["Mike", "Jim"], - ... index=["Mon", "Tue", "Wed", "Thurs", "Fri"]) - >>> styler = df.style.concat(df.agg(["sum"]).style) # doctest: +SKIP + >>> df = pd.DataFrame( + ... [[4, 6], [1, 9], [3, 4], [5, 5], [9, 6]], + ... columns=["Mike", "Jim"], + ... index=["Mon", "Tue", "Wed", "Thurs", "Fri"], + ... ) + >>> styler = df.style.concat( + ... df.agg(["sum"]).style + ... ) # doctest: +SKIP .. figure:: ../../_static/style/footer_simple.png @@ -363,14 +371,27 @@ def concat(self, other: Styler) -> Styler: >>> descriptors = df.agg(["sum", "mean", lambda s: s.dtype]) >>> descriptors.index = ["Total", "Average", "dtype"] - >>> other = (descriptors.style - ... 
.highlight_max(axis=1, subset=(["Total", "Average"], slice(None))) - ... .format(subset=("Average", slice(None)), precision=2, decimal=",") - ... .map(lambda v: "font-weight: bold;")) - >>> styler = (df.style - ... .highlight_max(color="salmon") - ... .set_table_styles([{"selector": ".foot_row0", - ... "props": "border-top: 1px solid black;"}])) + >>> other = ( + ... descriptors.style.highlight_max( + ... axis=1, subset=(["Total", "Average"], slice(None)) + ... ) + ... .format( + ... subset=("Average", slice(None)), + ... precision=2, + ... decimal=",", + ... ) + ... .map(lambda v: "font-weight: bold;") + ... ) + >>> styler = df.style.highlight_max( + ... color="salmon" + ... ).set_table_styles( + ... [ + ... { + ... "selector": ".foot_row0", + ... "props": "border-top: 1px solid black;", + ... } + ... ] + ... ) >>> styler.concat(other) # doctest: +SKIP .. figure:: ../../_static/style/footer_extended.png @@ -378,10 +399,14 @@ def concat(self, other: Styler) -> Styler: When ``other`` has fewer index levels than the original Styler it is possible to extend the index in ``other``, with placeholder levels. - >>> df = pd.DataFrame([[1], [2]], - ... index=pd.MultiIndex.from_product([[0], [1, 2]])) + >>> df = pd.DataFrame( + ... [[1], [2]], + ... index=pd.MultiIndex.from_product([[0], [1, 2]]), + ... ) >>> descriptors = df.agg(["sum"]) - >>> descriptors.index = pd.MultiIndex.from_product([[""], descriptors.index]) + >>> descriptors.index = pd.MultiIndex.from_product( + ... [[""], descriptors.index] + ... ) >>> df.style.concat(descriptors.style) # doctest: +SKIP """ if not isinstance(other, Styler): @@ -472,18 +497,28 @@ def set_tooltips( >>> df = pd.DataFrame(data=[[0, 1], [2, 3]]) >>> ttips = pd.DataFrame( - ... data=[["Min", ""], [np.nan, "Max"]], columns=df.columns, index=df.index + ... data=[["Min", ""], [np.nan, "Max"]], + ... columns=df.columns, + ... index=df.index, ... ) >>> s = df.style.set_tooltips(ttips).to_html() Optionally controlling the tooltip visual display - >>> df.style.set_tooltips(ttips, css_class='tt-add', props=[ - ... ('visibility', 'hidden'), - ... ('position', 'absolute'), - ... ('z-index', 1)]) # doctest: +SKIP - >>> df.style.set_tooltips(ttips, css_class='tt-add', - ... props='visibility:hidden; position:absolute; z-index:1;') + >>> df.style.set_tooltips( + ... ttips, + ... css_class="tt-add", + ... props=[ + ... ("visibility", "hidden"), + ... ("position", "absolute"), + ... ("z-index", 1), + ... ], + ... ) # doctest: +SKIP + >>> df.style.set_tooltips( + ... ttips, + ... css_class="tt-add", + ... props="visibility:hidden; position:absolute; z-index:1;", + ... ) ... # doctest: +SKIP """ if not self.cell_ids: @@ -1307,7 +1342,7 @@ def to_html( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> print(df.style.to_html()) # doctest: +SKIP @@ -1434,7 +1469,7 @@ def to_string( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> df.style.to_string() ' A B\\n0 1 3\\n1 2 4\\n' """ @@ -1486,20 +1521,32 @@ def set_td_classes(self, classes: DataFrame) -> Styler: Examples -------- - >>> df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) - >>> classes = pd.DataFrame([ - ... ["min-val red", "", "blue"], - ... ["red", None, "blue max-val"] - ... ], index=df.index, columns=df.columns) + >>> df = pd.DataFrame( + ... data=[[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"] + ... ) + >>> classes = pd.DataFrame( + ... [ + ... 
["min-val red", "", "blue"], + ... ["red", None, "blue max-val"], + ... ], + ... index=df.index, + ... columns=df.columns, + ... ) >>> df.style.set_td_classes(classes) # doctest: +SKIP Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the underlying, - >>> df = pd.DataFrame([[1,2],[3,4]], index=["a", "b"], - ... columns=[["level0", "level0"], ["level1a", "level1b"]]) - >>> classes = pd.DataFrame(["min-val"], index=["a"], - ... columns=[["level0"],["level1a"]]) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4]], + ... index=["a", "b"], + ... columns=[["level0", "level0"], ["level1a", "level1b"]], + ... ) + >>> classes = pd.DataFrame( + ... ["min-val"], + ... index=["a"], + ... columns=[["level0"], ["level1a"]], + ... ) >>> df.style.set_td_classes(classes) # doctest: +SKIP Form of the output with new additional css classes, @@ -1671,11 +1718,11 @@ def clear(self) -> None: Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, np.nan]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, np.nan]}) After any added style: - >>> df.style.highlight_null(color='yellow') # doctest: +SKIP + >>> df.style.highlight_null(color="yellow") # doctest: +SKIP Remove it with: @@ -1810,31 +1857,51 @@ def apply( Examples -------- >>> def highlight_max(x, color): - ... return np.where(x == np.nanmax(x.to_numpy()), f"color: {color};", None) + ... return np.where( + ... x == np.nanmax(x.to_numpy()), + ... f"color: {color};", + ... None, + ... ) >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - >>> df.style.apply(highlight_max, color='red') # doctest: +SKIP - >>> df.style.apply(highlight_max, color='blue', axis=1) # doctest: +SKIP - >>> df.style.apply(highlight_max, color='green', axis=None) # doctest: +SKIP + >>> df.style.apply(highlight_max, color="red") # doctest: +SKIP + >>> df.style.apply( + ... highlight_max, color="blue", axis=1 + ... ) # doctest: +SKIP + >>> df.style.apply( + ... highlight_max, color="green", axis=None + ... ) # doctest: +SKIP Using ``subset`` to restrict application to a single column or multiple columns - >>> df.style.apply(highlight_max, color='red', subset="A") + >>> df.style.apply(highlight_max, color="red", subset="A") ... # doctest: +SKIP - >>> df.style.apply(highlight_max, color='red', subset=["A", "B"]) + >>> df.style.apply( + ... highlight_max, color="red", subset=["A", "B"] + ... ) ... # doctest: +SKIP Using a 2d input to ``subset`` to select rows in addition to columns - >>> df.style.apply(highlight_max, color='red', subset=([0, 1, 2], slice(None))) + >>> df.style.apply( + ... highlight_max, + ... color="red", + ... subset=([0, 1, 2], slice(None)), + ... ) ... # doctest: +SKIP - >>> df.style.apply(highlight_max, color='red', subset=(slice(0, 5, 2), "A")) + >>> df.style.apply( + ... highlight_max, color="red", subset=(slice(0, 5, 2), "A") + ... ) ... # doctest: +SKIP Using a function which returns a Series / DataFrame of unequal length but containing valid index labels - >>> df = pd.DataFrame([[1, 2], [3, 4], [4, 6]], index=["A1", "A2", "Total"]) - >>> total_style = pd.Series("font-weight: bold;", index=["Total"]) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4], [4, 6]], index=["A1", "A2", "Total"] + ... ) + >>> total_style = pd.Series( + ... "font-weight: bold;", index=["Total"] + ... ) >>> df.style.apply(lambda s: total_style) # doctest: +SKIP See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for @@ -1927,7 +1994,7 @@ def apply_index( -------- Basic usage to conditionally highlight values in the index. 
- >>> df = pd.DataFrame([[1,2], [3,4]], index=["A", "B"]) + >>> df = pd.DataFrame([[1, 2], [3, 4]], index=["A", "B"]) >>> def color_b(s): ... return {ret} >>> df.style.{this}_index(color_b) # doctest: +SKIP @@ -1936,7 +2003,9 @@ def apply_index( Selectively applying to specific levels of MultiIndex columns. - >>> midx = pd.MultiIndex.from_product([['ix', 'jy'], [0, 1], ['x3', 'z4']]) + >>> midx = pd.MultiIndex.from_product( + ... [["ix", "jy"], [0, 1], ["x3", "z4"]] + ... ) >>> df = pd.DataFrame([np.arange(8)], columns=midx) >>> def highlight_x({var}): ... return {ret2} @@ -2064,21 +2133,28 @@ def map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: >>> def color_negative(v, color): ... return f"color: {color};" if v < 0 else None >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - >>> df.style.map(color_negative, color='red') # doctest: +SKIP + >>> df.style.map(color_negative, color="red") # doctest: +SKIP Using ``subset`` to restrict application to a single column or multiple columns - >>> df.style.map(color_negative, color='red', subset="A") - ... # doctest: +SKIP - >>> df.style.map(color_negative, color='red', subset=["A", "B"]) - ... # doctest: +SKIP + >>> df.style.map(color_negative, color="red", subset="A") + ... # doctest: +SKIP + >>> df.style.map(color_negative, color="red", subset=["A", "B"]) + ... # doctest: +SKIP Using a 2d input to ``subset`` to select rows in addition to columns - >>> df.style.map(color_negative, color='red', - ... subset=([0,1,2], slice(None))) # doctest: +SKIP - >>> df.style.map(color_negative, color='red', subset=(slice(0,5,2), "A")) - ... # doctest: +SKIP + >>> df.style.map( + ... color_negative, + ... color="red", + ... subset=([0, 1, 2], slice(None)), + ... ) # doctest: +SKIP + >>> df.style.map( + ... color_negative, + ... color="red", + ... subset=(slice(0, 5, 2), "A"), + ... ) + ... # doctest: +SKIP See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. @@ -2142,7 +2218,9 @@ def set_table_attributes(self, attributes: str) -> Styler: Examples -------- >>> df = pd.DataFrame(np.random.randn(10, 4)) - >>> df.style.set_table_attributes('class="pure-table"') # doctest: +SKIP + >>> df.style.set_table_attributes( + ... 'class="pure-table"' + ... ) # doctest: +SKIP # ...
... """ self.table_attributes = attributes @@ -2293,7 +2371,9 @@ def set_uuid(self, uuid: str) -> Styler: Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], index=['A', 'B'], columns=['c1', 'c2']) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4]], index=["A", "B"], columns=["c1", "c2"] + ... ) You can get the `id` attributes with the following: @@ -2327,7 +2407,7 @@ def set_caption(self, caption: str | tuple | list) -> Styler: Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> df.style.set_caption("test") # doctest: +SKIP Please see: @@ -2383,7 +2463,7 @@ def set_sticky( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> df.style.set_sticky(axis="index") # doctest: +SKIP Please see: @@ -2544,49 +2624,71 @@ def set_table_styles( .. code-block:: python - css_class_names = {"row_heading": "row_heading", - "col_heading": "col_heading", - "index_name": "index_name", - "col": "col", - "row": "row", - "col_trim": "col_trim", - "row_trim": "row_trim", - "level": "level", - "data": "data", - "blank": "blank", - "foot": "foot"} + css_class_names = { + "row_heading": "row_heading", + "col_heading": "col_heading", + "index_name": "index_name", + "col": "col", + "row": "row", + "col_trim": "col_trim", + "row_trim": "row_trim", + "level": "level", + "data": "data", + "blank": "blank", + "foot": "foot", + } Examples -------- - >>> df = pd.DataFrame(np.random.randn(10, 4), - ... columns=['A', 'B', 'C', 'D']) + >>> df = pd.DataFrame( + ... np.random.randn(10, 4), columns=["A", "B", "C", "D"] + ... ) >>> df.style.set_table_styles( - ... [{'selector': 'tr:hover', - ... 'props': [('background-color', 'yellow')]}] + ... [ + ... { + ... "selector": "tr:hover", + ... "props": [("background-color", "yellow")], + ... } + ... ] ... ) # doctest: +SKIP Or with CSS strings >>> df.style.set_table_styles( - ... [{'selector': 'tr:hover', - ... 'props': 'background-color: yellow; font-size: 1em;'}] + ... [ + ... { + ... "selector": "tr:hover", + ... "props": "background-color: yellow; font-size: 1em;", + ... } + ... ] ... ) # doctest: +SKIP Adding column styling by name - >>> df.style.set_table_styles({ - ... 'A': [{'selector': '', - ... 'props': [('color', 'red')]}], - ... 'B': [{'selector': 'td', - ... 'props': 'color: blue;'}] - ... }, overwrite=False) # doctest: +SKIP + >>> df.style.set_table_styles( + ... { + ... "A": [ + ... {"selector": "", "props": [("color", "red")]} + ... ], + ... "B": [{"selector": "td", "props": "color: blue;"}], + ... }, + ... overwrite=False, + ... ) # doctest: +SKIP Adding row styling - >>> df.style.set_table_styles({ - ... 0: [{'selector': 'td:hover', - ... 'props': [('font-size', '25px')]}] - ... }, axis=1, overwrite=False) # doctest: +SKIP + >>> df.style.set_table_styles( + ... { + ... 0: [ + ... { + ... "selector": "td:hover", + ... "props": [("font-size", "25px")], + ... } + ... ] + ... }, + ... axis=1, + ... overwrite=False, + ... ) # doctest: +SKIP See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. @@ -2717,15 +2819,21 @@ def hide( -------- Simple application hiding specific rows: - >>> df = pd.DataFrame([[1,2], [3,4], [5,6]], index=["a", "b", "c"]) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4], [5, 6]], index=["a", "b", "c"] + ... 
) >>> df.style.hide(["a", "b"]) # doctest: +SKIP 0 1 c 5 6 Hide the index and retain the data values: - >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) - >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) + >>> midx = pd.MultiIndex.from_product( + ... [["x", "y"], ["a", "b", "c"]] + ... ) + >>> df = pd.DataFrame( + ... np.random.randn(6, 6), index=midx, columns=midx + ... ) >>> df.style.format("{:.1f}").hide() # doctest: +SKIP x y a b c a b c @@ -2738,8 +2846,10 @@ def hide( Hide specific rows in a MultiIndex but retain the index: - >>> df.style.format("{:.1f}").hide(subset=(slice(None), ["a", "c"])) - ... # doctest: +SKIP + >>> df.style.format("{:.1f}").hide( + ... subset=(slice(None), ["a", "c"]) + ... ) + ... # doctest: +SKIP x y a b c a b c x b 0.7 1.0 1.3 1.5 -0.0 -0.2 @@ -2747,8 +2857,10 @@ def hide( Hide specific rows and the index through chaining: - >>> df.style.format("{:.1f}").hide(subset=(slice(None), ["a", "c"])).hide() - ... # doctest: +SKIP + >>> df.style.format("{:.1f}").hide( + ... subset=(slice(None), ["a", "c"]) + ... ).hide() + ... # doctest: +SKIP x y a b c a b c 0.7 1.0 1.3 1.5 -0.0 -0.2 @@ -2769,7 +2881,9 @@ def hide( Hiding just the index level names: >>> df.index.names = ["lev0", "lev1"] - >>> df.style.format("{:,.1f}").hide(names=True) # doctest: +SKIP + >>> df.style.format("{:,.1f}").hide( + ... names=True + ... ) # doctest: +SKIP x y a b c a b c x a 0.1 0.0 0.4 1.3 0.6 -1.4 @@ -2915,10 +3029,14 @@ def background_gradient( Examples -------- - >>> df = pd.DataFrame(columns=["City", "Temp (c)", "Rain (mm)", "Wind (m/s)"], - ... data=[["Stockholm", 21.6, 5.0, 3.2], - ... ["Oslo", 22.4, 13.3, 3.1], - ... ["Copenhagen", 24.5, 0.0, 6.7]]) + >>> df = pd.DataFrame( + ... columns=["City", "Temp (c)", "Rain (mm)", "Wind (m/s)"], + ... data=[ + ... ["Stockholm", 21.6, 5.0, 3.2], + ... ["Oslo", 22.4, 13.3, 3.1], + ... ["Copenhagen", 24.5, 0.0, 6.7], + ... ], + ... ) Shading the values column-wise, with ``axis=0``, preselecting numeric columns @@ -2954,7 +3072,7 @@ def background_gradient( Setting the gradient map for a dataframe (i.e. ``axis=None``), we need to explicitly state ``subset`` to match the ``gmap`` shape - >>> gmap = np.array([[1,2,3], [2,3,4], [3,4,5]]) + >>> gmap = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]]) >>> df.style.{name}_gradient(axis=None, gmap=gmap, ... cmap='YlOrRd', subset=['Temp (c)', 'Rain (mm)', 'Wind (m/s)'] ... ) # doctest: +SKIP @@ -3035,8 +3153,12 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: Examples -------- >>> df = pd.DataFrame(np.random.randn(10, 4)) - >>> df.style.set_properties(color="white", align="right") # doctest: +SKIP - >>> df.style.set_properties(**{'background-color': 'yellow'}) # doctest: +SKIP + >>> df.style.set_properties( + ... color="white", align="right" + ... ) # doctest: +SKIP + >>> df.style.set_properties( + ... **{"background-color": "yellow"} + ... ) # doctest: +SKIP See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. 
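(The reflowed doctests above and throughout this patch are the output of ruff's docstring code formatter, adopted alongside the bump to ruff v0.1.8 in `.pre-commit-config.yaml`. As a minimal sketch of how that formatter is switched on — assuming, since the actual pyproject.toml hunk is not shown in this part of the patch, that it lands in the `[tool.ruff.format]` table:

    [tool.ruff.format]
    # Reformat code snippets (doctests) embedded in docstrings.
    # The docstring-code-format option ships with ruff v0.1.8; its exact
    # placement here is an assumption, not the literal pandas hunk.
    docstring-code-format = true

With this enabled, `ruff format` rewrites `>>>` snippets with the same conventions as regular code — double quotes, trailing commas in exploded calls, black-compatible wrapping — which is exactly the churn visible in these hunks.)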
@@ -3132,8 +3254,8 @@ def bar( # pylint: disable=disallowed-name Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) - >>> df.style.bar(subset=['A'], color='gray') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) + >>> df.style.bar(subset=["A"], color="gray") # doctest: +SKIP """ if color is None and cmap is None: color = "#d65f5f" @@ -3211,8 +3333,8 @@ def highlight_null( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, np.nan]}) - >>> df.style.highlight_null(color='yellow') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, np.nan]}) + >>> df.style.highlight_null(color="yellow") # doctest: +SKIP Please see: `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. @@ -3265,8 +3387,8 @@ def highlight_max( Examples -------- - >>> df = pd.DataFrame({'A': [2, 1], 'B': [3, 4]}) - >>> df.style.highlight_max(color='yellow') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [2, 1], "B": [3, 4]}) + >>> df.style.highlight_max(color="yellow") # doctest: +SKIP Please see: `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. @@ -3321,8 +3443,8 @@ def highlight_min( Examples -------- - >>> df = pd.DataFrame({'A': [2, 1], 'B': [3, 4]}) - >>> df.style.highlight_min(color='yellow') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [2, 1], "B": [3, 4]}) + >>> df.style.highlight_min(color="yellow") # doctest: +SKIP Please see: `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. @@ -3401,35 +3523,50 @@ def highlight_between( -------- Basic usage - >>> df = pd.DataFrame({ - ... 'One': [1.2, 1.6, 1.5], - ... 'Two': [2.9, 2.1, 2.5], - ... 'Three': [3.1, 3.2, 3.8], - ... }) - >>> df.style.highlight_between(left=2.1, right=2.9) # doctest: +SKIP + >>> df = pd.DataFrame( + ... { + ... "One": [1.2, 1.6, 1.5], + ... "Two": [2.9, 2.1, 2.5], + ... "Three": [3.1, 3.2, 3.8], + ... } + ... ) + >>> df.style.highlight_between( + ... left=2.1, right=2.9 + ... ) # doctest: +SKIP .. figure:: ../../_static/style/hbetw_basic.png Using a range input sequence along an ``axis``, in this case setting a ``left`` and ``right`` for each column individually - >>> df.style.highlight_between(left=[1.4, 2.4, 3.4], right=[1.6, 2.6, 3.6], - ... axis=1, color="#fffd75") # doctest: +SKIP + >>> df.style.highlight_between( + ... left=[1.4, 2.4, 3.4], + ... right=[1.6, 2.6, 3.6], + ... axis=1, + ... color="#fffd75", + ... ) # doctest: +SKIP .. figure:: ../../_static/style/hbetw_seq.png Using ``axis=None`` and providing the ``left`` argument as an array that matches the input DataFrame, with a constant ``right`` - >>> df.style.highlight_between(left=[[2,2,3],[2,2,3],[3,3,3]], right=3.5, - ... axis=None, color="#fffd75") # doctest: +SKIP + >>> df.style.highlight_between( + ... left=[[2, 2, 3], [2, 2, 3], [3, 3, 3]], + ... right=3.5, + ... axis=None, + ... color="#fffd75", + ... ) # doctest: +SKIP .. figure:: ../../_static/style/hbetw_axNone.png Using ``props`` instead of default background coloring - >>> df.style.highlight_between(left=1.5, right=3.5, - ... props='font-weight:bold;color:#e83e8c') # doctest: +SKIP + >>> df.style.highlight_between( + ... left=1.5, + ... right=3.5, + ... props="font-weight:bold;color:#e83e8c", + ... ) # doctest: +SKIP .. 
figure:: ../../_static/style/hbetw_props.png """ @@ -3503,23 +3640,31 @@ def highlight_quantile( -------- Using ``axis=None`` and apply a quantile to all collective data - >>> df = pd.DataFrame(np.arange(10).reshape(2,5) + 1) - >>> df.style.highlight_quantile(axis=None, q_left=0.8, color="#fffd75") - ... # doctest: +SKIP + >>> df = pd.DataFrame(np.arange(10).reshape(2, 5) + 1) + >>> df.style.highlight_quantile( + ... axis=None, q_left=0.8, color="#fffd75" + ... ) + ... # doctest: +SKIP .. figure:: ../../_static/style/hq_axNone.png Or highlight quantiles row-wise or column-wise, in this case by row-wise - >>> df.style.highlight_quantile(axis=1, q_left=0.8, color="#fffd75") - ... # doctest: +SKIP + >>> df.style.highlight_quantile( + ... axis=1, q_left=0.8, color="#fffd75" + ... ) + ... # doctest: +SKIP .. figure:: ../../_static/style/hq_ax1.png Use ``props`` instead of default background coloring - >>> df.style.highlight_quantile(axis=None, q_left=0.2, q_right=0.8, - ... props='font-weight:bold;color:#e83e8c') # doctest: +SKIP + >>> df.style.highlight_quantile( + ... axis=None, + ... q_left=0.2, + ... q_right=0.8, + ... props="font-weight:bold;color:#e83e8c", + ... ) # doctest: +SKIP .. figure:: ../../_static/style/hq_props.png """ @@ -3591,9 +3736,10 @@ def from_custom_template( Examples -------- >>> from pandas.io.formats.style import Styler - >>> EasyStyler = Styler.from_custom_template("path/to/template", - ... "template.tpl", - ... ) # doctest: +SKIP + >>> EasyStyler = Styler.from_custom_template( + ... "path/to/template", + ... "template.tpl", + ... ) # doctest: +SKIP >>> df = pd.DataFrame({"A": [1, 2]}) >>> EasyStyler(df) # doctest: +SKIP @@ -3654,9 +3800,11 @@ def pipe(self, func: Callable, *args, **kwargs): .. code-block:: python - (df.style.format(precision=3) - .pipe(g, arg1=a) - .pipe(f, arg2=b, arg3=c)) + ( + df.style.format(precision=3) + .pipe(g, arg1=a) + .pipe(f, arg2=b, arg3=c) + ) In particular, this allows users to define functions that take a styler object, along with other parameters, and return the styler after @@ -3671,22 +3819,30 @@ def pipe(self, func: Callable, *args, **kwargs): A common usage pattern is to pre-define styling operations which can be easily applied to a generic styler in a single ``pipe`` call. - >>> def some_highlights(styler, min_color="red", max_color="blue"): - ... styler.highlight_min(color=min_color, axis=None) - ... styler.highlight_max(color=max_color, axis=None) - ... styler.highlight_null() - ... return styler - >>> df = pd.DataFrame([[1, 2, 3, pd.NA], [pd.NA, 4, 5, 6]], dtype="Int64") - >>> df.style.pipe(some_highlights, min_color="green") # doctest: +SKIP + >>> def some_highlights( + ... styler, min_color="red", max_color="blue" + ... ): + ... styler.highlight_min(color=min_color, axis=None) + ... styler.highlight_max(color=max_color, axis=None) + ... styler.highlight_null() + ... return styler + >>> df = pd.DataFrame( + ... [[1, 2, 3, pd.NA], [pd.NA, 4, 5, 6]], dtype="Int64" + ... ) + >>> df.style.pipe( + ... some_highlights, min_color="green" + ... ) # doctest: +SKIP .. figure:: ../../_static/style/df_pipe_hl.png Since the method returns a ``Styler`` object it can be chained with other methods as if applying the underlying highlighters directly. - >>> (df.style.format("{:.1f}") - ... .pipe(some_highlights, min_color="green") - ... .highlight_between(left=2, right=5)) # doctest: +SKIP + >>> ( + ... df.style.format("{:.1f}") + ... .pipe(some_highlights, min_color="green") + ... .highlight_between(left=2, right=5) + ... 
) # doctest: +SKIP .. figure:: ../../_static/style/df_pipe_hl2.png @@ -3705,10 +3861,13 @@ def pipe(self, func: Callable, *args, **kwargs): >>> def highlight_last_level(styler): ... return styler.apply_index( - ... lambda v: "background-color: pink; color: yellow", axis="columns", - ... level=styler.columns.nlevels-1 + ... lambda v: "background-color: pink; color: yellow", + ... axis="columns", + ... level=styler.columns.nlevels - 1, ... ) # doctest: +SKIP - >>> df.columns = pd.MultiIndex.from_product([["A", "B"], ["X", "Y"]]) + >>> df.columns = pd.MultiIndex.from_product( + ... [["A", "B"], ["X", "Y"]] + ... ) >>> df.style.pipe(highlight_last_level) # doctest: +SKIP .. figure:: ../../_static/style/df_pipe_applymap.png @@ -3721,10 +3880,17 @@ def pipe(self, func: Callable, *args, **kwargs): >>> def highlight_header_missing(styler, level): ... def dynamic_highlight(s): ... return np.where( - ... styler.data.isna().any(), "background-color: red;", "" + ... styler.data.isna().any(), + ... "background-color: red;", + ... "", ... ) - ... return styler.apply_index(dynamic_highlight, axis=1, level=level) - >>> df.style.pipe(highlight_header_missing, level=1) # doctest: +SKIP + ... + ... return styler.apply_index( + ... dynamic_highlight, axis=1, level=level + ... ) + >>> df.style.pipe( + ... highlight_header_missing, level=1 + ... ) # doctest: +SKIP .. figure:: ../../_static/style/df_pipe_applydata.png """ diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 416b263ba8497..997965d55a203 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -1449,10 +1449,10 @@ def relabel_index( # relabel first, then hide df = pd.DataFrame({"col": ["a", "b", "c"]}) - df.style.relabel_index(["A", "B", "C"]).hide([0,1]) + df.style.relabel_index(["A", "B", "C"]).hide([0, 1]) # hide first, then relabel df = pd.DataFrame({"col": ["a", "b", "c"]}) - df.style.hide([0,1]).relabel_index(["C"]) + df.style.hide([0, 1]).relabel_index(["C"]) This method should be used, rather than :meth:`Styler.format_index`, in one of the following cases (see examples): @@ -1475,7 +1475,7 @@ def relabel_index( Chaining with pre-hidden elements - >>> df.style.hide([0,1]).relabel_index(["C"]) # doctest: +SKIP + >>> df.style.hide([0, 1]).relabel_index(["C"]) # doctest: +SKIP col C c @@ -1493,19 +1493,26 @@ def relabel_index( 1 5 1 0 6 1 7 - >>> styler.hide((midx.get_level_values(0)==0)|(midx.get_level_values(1)==0)) - ... # doctest: +SKIP - >>> styler.hide(level=[0,1]) # doctest: +SKIP - >>> styler.relabel_index(["binary6", "binary7"]) # doctest: +SKIP + >>> styler.hide( + ... (midx.get_level_values(0) == 0) + ... | (midx.get_level_values(1) == 0) + ... ) + ... # doctest: +SKIP + >>> styler.hide(level=[0, 1]) # doctest: +SKIP + >>> styler.relabel_index( + ... ["binary6", "binary7"] + ... ) # doctest: +SKIP col binary6 6 binary7 7 We can also achieve the above by indexing first and then re-labeling - >>> styler = df.loc[[(1,1,0), (1,1,1)]].style - >>> styler.hide(level=[0,1]).relabel_index(["binary6", "binary7"]) - ... # doctest: +SKIP + >>> styler = df.loc[[(1, 1, 0), (1, 1, 1)]].style + >>> styler.hide(level=[0, 1]).relabel_index( + ... ["binary6", "binary7"] + ... ) + ... 
# doctest: +SKIP col binary6 6 binary7 7 @@ -1516,9 +1523,11 @@ def relabel_index( brackets if the string if pre-formatted), >>> df = pd.DataFrame({"samples": np.random.rand(10)}) - >>> styler = df.loc[np.random.randint(0,10,3)].style - >>> styler.relabel_index([f"sample{i+1} ({{}})" for i in range(3)]) - ... # doctest: +SKIP + >>> styler = df.loc[np.random.randint(0, 10, 3)].style + >>> styler.relabel_index( + ... [f"sample{i+1} ({{}})" for i in range(3)] + ... ) + ... # doctest: +SKIP samples sample1 (5) 0.315811 sample2 (0) 0.495941 @@ -2151,10 +2160,15 @@ def _parse_latex_table_styles(table_styles: CSSStyles, selector: str) -> str | N Examples -------- - >>> table_styles = [{'selector': 'foo', 'props': [('attr','value')]}, - ... {'selector': 'bar', 'props': [('attr', 'overwritten')]}, - ... {'selector': 'bar', 'props': [('a1', 'baz'), ('a2', 'ignore')]}] - >>> _parse_latex_table_styles(table_styles, selector='bar') + >>> table_styles = [ + ... {"selector": "foo", "props": [("attr", "value")]}, + ... {"selector": "bar", "props": [("attr", "overwritten")]}, + ... { + ... "selector": "bar", + ... "props": [("a1", "baz"), ("a2", "ignore")], + ... }, + ... ] + >>> _parse_latex_table_styles(table_styles, selector="bar") 'baz' Notes @@ -2238,8 +2252,12 @@ def _parse_latex_header_span( Examples -------- - >>> cell = {'cellstyle': '', 'display_value':'text', 'attributes': 'colspan="3"'} - >>> _parse_latex_header_span(cell, 't', 'c') + >>> cell = { + ... "cellstyle": "", + ... "display_value": "text", + ... "attributes": 'colspan="3"', + ... } + >>> _parse_latex_header_span(cell, "t", "c") '\\multicolumn{3}{c}{text}' """ display_val = _parse_latex_cell_styles( diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 350002bf461ff..f54f6733d4698 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -179,10 +179,9 @@ def read_gbq( >>> sql = "SELECT name FROM table_name WHERE state = 'TX' LIMIT 100;" >>> df = pd.read_gbq(sql, dialect="standard") # doctest: +SKIP >>> project_id = "your-project-id" # doctest: +SKIP - >>> df = pd.read_gbq(sql, - ... project_id=project_id, - ... dialect="standard" - ... ) # doctest: +SKIP + >>> df = pd.read_gbq( + ... sql, project_id=project_id, dialect="standard" + ... ) # doctest: +SKIP """ warnings.warn( "read_gbq is deprecated and will be removed in a future version. " diff --git a/pandas/io/html.py b/pandas/io/html.py index 5d5bf079784be..bce753168ee56 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1100,13 +1100,13 @@ def read_html( passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - attrs = {{'id': 'table'}} + attrs = {{"id": "table"}} is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. :: - attrs = {{'asdf': 'table'}} + attrs = {{"asdf": "table"}} is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index b1e2210f9d894..df29df1ecaa63 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -308,7 +308,10 @@ def json_normalize( ... "name": "Cole Volk", ... "fitness": {"height": 130, "weight": 60}, ... }, - ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "name": "Mark Reg", + ... "fitness": {"height": 130, "weight": 60}, + ... }, ... { ... "id": 2, ... 
"name": "Faye Raker", @@ -329,7 +332,10 @@ def json_normalize( ... "name": "Cole Volk", ... "fitness": {"height": 130, "weight": 60}, ... }, - ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "name": "Mark Reg", + ... "fitness": {"height": 130, "weight": 60}, + ... }, ... { ... "id": 2, ... "name": "Faye Raker", @@ -364,7 +370,9 @@ def json_normalize( ... }, ... ] >>> result = pd.json_normalize( - ... data, "counties", ["state", "shortname", ["info", "governor"]] + ... data, + ... "counties", + ... ["state", "shortname", ["info", "governor"]], ... ) >>> result name population state shortname info.governor diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 4d9fba72cf173..e8b900a7d6463 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -174,7 +174,9 @@ def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype: Examples -------- - >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"}) + >>> convert_json_field_to_pandas_type( + ... {"name": "an_int", "type": "integer"} + ... ) 'int64' >>> convert_json_field_to_pandas_type( @@ -187,11 +189,17 @@ def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype: ... ) CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object) - >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"}) + >>> convert_json_field_to_pandas_type( + ... {"name": "a_datetime", "type": "datetime"} + ... ) 'datetime64[ns]' >>> convert_json_field_to_pandas_type( - ... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"} + ... { + ... "name": "a_datetime_with_tz", + ... "type": "datetime", + ... "tz": "US/Central", + ... } ... ) 'datetime64[ns, US/Central]' """ diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 295eb3cbdd500..c0793aa14bf4e 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -599,7 +599,7 @@ def read_parquet( -------- >>> original_df = pd.DataFrame( ... {{"foo": range(5), "bar": range(5, 10)}} - ... ) + ... ) >>> original_df foo bar 0 0 5 @@ -619,7 +619,9 @@ def read_parquet( 4 4 9 >>> restored_df.equals(original_df) True - >>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"]) + >>> restored_bar = pd.read_parquet( + ... BytesIO(df_parquet_bytes), columns=["bar"] + ... ) >>> restored_bar bar 0 5 @@ -627,7 +629,7 @@ def read_parquet( 2 7 3 8 4 9 - >>> restored_bar.equals(original_df[['bar']]) + >>> restored_bar.equals(original_df[["bar"]]) True The function uses `kwargs` that are passed directly to the engine. @@ -640,7 +642,9 @@ def read_parquet( economical in terms of memory. >>> sel = [("foo", ">", 2)] - >>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel) + >>> restored_part = pd.read_parquet( + ... BytesIO(df_parquet_bytes), filters=sel + ... 
) >>> restored_part foo bar 0 3 8 diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 0cd788c5e5739..4feec634bcd10 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -390,7 +390,7 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: def ensure_dtype_objs( - dtype: DtypeArg | dict[Hashable, DtypeArg] | None + dtype: DtypeArg | dict[Hashable, DtypeArg] | None, ) -> DtypeObj | dict[Hashable, DtypeObj] | None: """ Ensure we have either None, a dtype object, or a dictionary mapping to diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 7326ad831ee96..de2682ae22a35 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1411,7 +1411,7 @@ def read_fwf( Examples -------- - >>> pd.read_fwf('data.csv') # doctest: +SKIP + >>> pd.read_fwf("data.csv") # doctest: +SKIP """ # Check input arguments. if colspecs is None and widths is None: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 0dae0e7106b69..37cd0f94e8d3c 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -78,7 +78,9 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP + >>> original_df = pd.DataFrame( + ... {{"foo": range(5), "bar": range(5, 10)}} + ... ) # doctest: +SKIP >>> original_df # doctest: +SKIP foo bar 0 0 5 @@ -88,7 +90,9 @@ def to_pickle( 4 4 9 >>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP - >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP + >>> unpickled_df = pd.read_pickle( + ... "./dummy.pkl" + ... ) # doctest: +SKIP >>> unpickled_df # doctest: +SKIP foo bar 0 0 5 @@ -96,7 +100,7 @@ def to_pickle( 2 2 7 3 3 8 4 4 9 - """ # noqa: E501 + """ if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL @@ -162,7 +166,7 @@ def read_pickle( -------- >>> original_df = pd.DataFrame( ... {{"foo": range(5), "bar": range(5, 10)}} - ... ) # doctest: +SKIP + ... ) # doctest: +SKIP >>> original_df # doctest: +SKIP foo bar 0 0 5 @@ -172,7 +176,9 @@ def read_pickle( 4 4 9 >>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP - >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP + >>> unpickled_df = pd.read_pickle( + ... "./dummy.pkl" + ... ) # doctest: +SKIP >>> unpickled_df # doctest: +SKIP foo bar 0 0 5 diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 50611197ad7dd..5f50aa2951b3b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -388,9 +388,11 @@ def read_hdf( Examples -------- - >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP - >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP - >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP + >>> df = pd.DataFrame( + ... [[1, 1.0, "a"]], columns=["x", "y", "z"] + ... 
) # doctest: +SKIP + >>> df.to_hdf("./store.h5", "data") # doctest: +SKIP + >>> reread = pd.read_hdf("./store.h5") # doctest: +SKIP """ if mode not in ["r", "r+", "a"]: raise ValueError( @@ -531,9 +533,9 @@ class HDFStore: Examples -------- >>> bar = pd.DataFrame(np.random.randn(10, 4)) - >>> store = pd.HDFStore('test.h5') - >>> store['foo'] = bar # write to HDF5 - >>> bar = store['foo'] # retrieve + >>> store = pd.HDFStore("test.h5") + >>> store["foo"] = bar # write to HDF5 + >>> bar = store["foo"] # retrieve >>> store.close() **Create or load HDF5 file in-memory** @@ -543,9 +545,9 @@ class HDFStore: written when closed: >>> bar = pd.DataFrame(np.random.randn(10, 4)) - >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE') - >>> store['foo'] = bar - >>> store.close() # only now, data is written to disk + >>> store = pd.HDFStore("test.h5", driver="H5FD_CORE") + >>> store["foo"] = bar + >>> store.close() # only now, data is written to disk """ _handle: File | None @@ -669,10 +671,10 @@ def keys(self, include: str = "pandas") -> list[str]: Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP - >>> store.get('data') # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP + >>> store.get("data") # doctest: +SKIP >>> print(store.keys()) # doctest: +SKIP ['/data1', '/data2'] >>> store.close() # doctest: +SKIP @@ -798,10 +800,10 @@ def get(self, key: str): Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP - >>> store.get('data') # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP + >>> store.get("data") # doctest: +SKIP >>> store.close() # doctest: +SKIP """ with patch_pickle(): @@ -860,17 +862,19 @@ def select( Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP - >>> store.get('data') # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP + >>> store.get("data") # doctest: +SKIP >>> print(store.keys()) # doctest: +SKIP ['/data1', '/data2'] - >>> store.select('/data1') # doctest: +SKIP + >>> store.select("/data1") # doctest: +SKIP A B 0 1 2 1 3 4 - >>> store.select('/data1', where='columns == A') # doctest: +SKIP + >>> store.select( + ... "/data1", where="columns == A" + ... 
) # doctest: +SKIP A 0 1 1 3 @@ -1150,9 +1154,9 @@ def put( Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1292,11 +1296,11 @@ def append( Examples -------- - >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df1, format='table') # doctest: +SKIP - >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B']) - >>> store.append('data', df2) # doctest: +SKIP + >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df1, format="table") # doctest: +SKIP + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["A", "B"]) + >>> store.append("data", df2) # doctest: +SKIP >>> store.close() # doctest: +SKIP A B 0 1 2 @@ -1483,9 +1487,9 @@ def groups(self) -> list: Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP >>> print(store.groups()) # doctest: +SKIP >>> store.close() # doctest: +SKIP [/data (Group) '' @@ -1538,11 +1542,11 @@ def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]: Examples -------- - >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df1, format='table') # doctest: +SKIP - >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B']) - >>> store.append('data', df2) # doctest: +SKIP + >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df1, format="table") # doctest: +SKIP + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["A", "B"]) + >>> store.append("data", df2) # doctest: +SKIP >>> store.close() # doctest: +SKIP >>> for group in store.walk(): # doctest: +SKIP ... 
print(group) # doctest: +SKIP @@ -1664,9 +1668,9 @@ def info(self) -> str: Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP >>> print(store.info()) # doctest: +SKIP >>> store.close() # doctest: +SKIP @@ -2170,9 +2174,7 @@ def convert( # "Union[Type[Index], Type[DatetimeIndex]]") factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment] x, freq=kwds.get("freq", None) - )._rename( - kwds["name"] - ) + )._rename(kwds["name"]) # making an Index instance could throw a number of different errors try: @@ -3181,7 +3183,9 @@ def write_array( # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "asi8" self._handle.create_array( - self.group, key, value.asi8 # type: ignore[union-attr] + self.group, + key, + value.asi8, # type: ignore[union-attr] ) node = getattr(self.group, key) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 12118d1488932..501b2889e11b2 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -365,7 +365,9 @@ def read_sql_table( Examples -------- - >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP + >>> pd.read_sql_table( + ... "table_name", "postgres:///db_name" + ... ) # doctest:+SKIP """ check_dtype_backend(dtype_backend) @@ -504,7 +506,9 @@ def read_sql_query( Examples -------- >>> from sqlalchemy import create_engine # doctest: +SKIP - >>> engine = create_engine("sqlite:///database.db") # doctest: +SKIP + >>> engine = create_engine( + ... "sqlite:///database.db" + ... ) # doctest: +SKIP >>> with engine.connect() as conn, conn.begin(): # doctest: +SKIP ... data = pd.read_sql_table("data", conn) # doctest: +SKIP """ @@ -651,27 +655,35 @@ def read_sql( providing only the SQL tablename will result in an error. >>> from sqlite3 import connect - >>> conn = connect(':memory:') - >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']], - ... columns=['int_column', 'date_column']) - >>> df.to_sql(name='test_data', con=conn) + >>> conn = connect(":memory:") + >>> df = pd.DataFrame( + ... data=[[0, "10/11/12"], [1, "12/11/10"]], + ... columns=["int_column", "date_column"], + ... ) + >>> df.to_sql(name="test_data", con=conn) 2 - >>> pd.read_sql('SELECT int_column, date_column FROM test_data', conn) + >>> pd.read_sql( + ... "SELECT int_column, date_column FROM test_data", conn + ... ) int_column date_column 0 0 10/11/12 1 1 12/11/10 - >>> pd.read_sql('test_data', 'postgres:///db_name') # doctest:+SKIP + >>> pd.read_sql( + ... "test_data", "postgres:///db_name" + ... ) # doctest:+SKIP Apply date parsing to columns through the ``parse_dates`` argument The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns. Custom argument values for applying ``pd.to_datetime`` on a column are specified via a dictionary format: - >>> pd.read_sql('SELECT int_column, date_column FROM test_data', - ... conn, - ... parse_dates={"date_column": {"format": "%d/%m/%y"}}) + >>> pd.read_sql( + ... "SELECT int_column, date_column FROM test_data", + ... conn, + ... parse_dates={"date_column": {"format": "%d/%m/%y"}}, + ... 
) int_column date_column 0 0 2012-11-10 1 1 2010-11-12 @@ -681,8 +693,10 @@ def read_sql( pandas now supports reading via ADBC drivers >>> from adbc_driver_postgresql import dbapi # doctest:+SKIP - >>> with dbapi.connect('postgres:///db_name') as conn: # doctest:+SKIP - ... pd.read_sql('SELECT int_column FROM test_data', conn) + >>> with dbapi.connect( + ... "postgres:///db_name" + ... ) as conn: # doctest:+SKIP + ... pd.read_sql("SELECT int_column FROM test_data", conn) int_column 0 0 1 1 diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d1484510b654f..f3ab2c2adf1e6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -254,7 +254,7 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: Examples -------- >>> dates = pd.Series([52]) - >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw") + >>> _stata_elapsed_date_to_datetime_vec(dates, "%tw") 0 1961-01-01 dtype: datetime64[ns] @@ -1992,10 +1992,15 @@ def data_label(self) -> str: >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) >>> data_label = "This is a data file." >>> path = "/My_path/filename.dta" - >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP - ... data_label=data_label, # doctest: +SKIP - ... version=None) # doctest: +SKIP - >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP + >>> df.to_stata( + ... path, + ... time_stamp=time_stamp, # doctest: +SKIP + ... data_label=data_label, # doctest: +SKIP + ... version=None, + ... ) # doctest: +SKIP + >>> with pd.io.stata.StataReader( + ... path + ... ) as reader: # doctest: +SKIP ... print(reader.data_label) # doctest: +SKIP This is a data file. """ @@ -2020,13 +2025,21 @@ def variable_labels(self) -> dict[str, str]: Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"]) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4]], columns=["col_1", "col_2"] + ... ) >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) >>> path = "/My_path/filename.dta" >>> variable_labels = {"col_1": "This is an example"} - >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP - ... variable_labels=variable_labels, version=None) # doctest: +SKIP - >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP + >>> df.to_stata( + ... path, + ... time_stamp=time_stamp, # doctest: +SKIP + ... variable_labels=variable_labels, + ... version=None, + ... ) # doctest: +SKIP + >>> with pd.io.stata.StataReader( + ... path + ... ) as reader: # doctest: +SKIP ... print(reader.variable_labels()) # doctest: +SKIP {'index': '', 'col_1': 'This is an example', 'col_2': ''} >>> pd.read_stata(path) # doctest: +SKIP @@ -2047,13 +2060,21 @@ def value_labels(self) -> dict[str, dict[float, str]]: Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"]) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4]], columns=["col_1", "col_2"] + ... ) >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) >>> path = "/My_path/filename.dta" >>> value_labels = {"col_1": {3: "x"}} - >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP - ... value_labels=value_labels, version=None) # doctest: +SKIP - >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP + >>> df.to_stata( + ... path, + ... time_stamp=time_stamp, # doctest: +SKIP + ... value_labels=value_labels, + ... version=None, + ... ) # doctest: +SKIP + >>> with pd.io.stata.StataReader( + ... path + ... ) as reader: # doctest: +SKIP ... 
print(reader.value_labels()) # doctest: +SKIP
         {'col_1': {3: 'x'}}
         >>> pd.read_stata(path) # doctest: +SKIP
@@ -2309,19 +2330,27 @@ class StataWriter(StataParser):
 
     Examples
     --------
-    >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
-    >>> writer = StataWriter('./data_file.dta', data)
+    >>> data = pd.DataFrame([[1.0, 1]], columns=["a", "b"])
+    >>> writer = StataWriter("./data_file.dta", data)
     >>> writer.write_file()
 
     Directly write a zip file
-    >>> compression = {{"method": "zip", "archive_name": "data_file.dta"}}
-    >>> writer = StataWriter('./data_file.zip', data, compression=compression)
+    >>> compression = {{
+    ...     "method": "zip", "archive_name": "data_file.dta"
+    ... }}
+    >>> writer = StataWriter(
+    ...     "./data_file.zip", data, compression=compression
+    ... )
     >>> writer.write_file()
 
     Save a DataFrame with dates
     >>> from datetime import datetime
-    >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date'])
-    >>> writer = StataWriter('./date_data_file.dta', data, {{'date' : 'tw'}})
+    >>> data = pd.DataFrame(
+    ...     [[datetime(2000, 1, 1)]], columns=["date"]
+    ... )
+    >>> writer = StataWriter(
+    ...     "./date_data_file.dta", data, {{"date": "tw"}}
+    ... )
     >>> writer.write_file()
     """
 
@@ -2692,18 +2721,28 @@ def write_file(self) -> None:
 
         Examples
         --------
-        >>> df = pd.DataFrame({"fully_labelled": [1, 2, 3, 3, 1],
-        ...                    "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan],
-        ...                    "Y": [7, 7, 9, 8, 10],
-        ...                    "Z": pd.Categorical(["j", "k", "l", "k", "j"]),
-        ...                    })
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "fully_labelled": [1, 2, 3, 3, 1],
+        ...         "partially_labelled": [
+        ...             1.0,
+        ...             2.0,
+        ...             np.nan,
+        ...             9.0,
+        ...             np.nan,
+        ...         ],
+        ...         "Y": [7, 7, 9, 8, 10],
+        ...         "Z": pd.Categorical(["j", "k", "l", "k", "j"]),
+        ...     }
+        ... )
        >>> path = "/My_path/filename.dta"
-        >>> labels = {"fully_labelled": {1: "one", 2: "two", 3: "three"},
-        ...           "partially_labelled": {1.0: "one", 2.0: "two"},
-        ...           }
-        >>> writer = pd.io.stata.StataWriter(path,
-        ...                                  df,
-        ...                                  value_labels=labels) # doctest: +SKIP
+        >>> labels = {
+        ...     "fully_labelled": {1: "one", 2: "two", 3: "three"},
+        ...     "partially_labelled": {1.0: "one", 2.0: "two"},
+        ... }
+        >>> writer = pd.io.stata.StataWriter(
+        ...     path, df, value_labels=labels
+        ... ) # doctest: +SKIP
         >>> writer.write_file() # doctest: +SKIP
         >>> df = pd.read_stata(path) # doctest: +SKIP
         >>> df # doctest: +SKIP
@@ -3263,22 +3302,32 @@ class StataWriter117(StataWriter):
 
     Examples
     --------
-    >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])
-    >>> writer = pd.io.stata.StataWriter117('./data_file.dta', data)
+    >>> data = pd.DataFrame(
+    ...     [[1.0, 1, "a"]], columns=["a", "b", "c"]
+    ... )
+    >>> writer = pd.io.stata.StataWriter117("./data_file.dta", data)
     >>> writer.write_file()
 
     Directly write a zip file
-    >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
+    >>> compression = {
+    ...     "method": "zip",
+    ...     "archive_name": "data_file.dta",
+    ... }
     >>> writer = pd.io.stata.StataWriter117(
-    ...     './data_file.zip', data, compression=compression
-    ... )
+    ...     "./data_file.zip", data, compression=compression
+    ... )
     >>> writer.write_file()
 
     Or with long strings stored in strl format
-    >>> data = pd.DataFrame([['A relatively long string'], [''], ['']],
-    ...                     columns=['strls'])
+    >>> data = pd.DataFrame(
+    ...     [["A relatively long string"], [""], [""]],
+    ...     columns=["strls"],
+    ... )
     >>> writer = pd.io.stata.StataWriter117(
-    ...     './data_file_with_long_strings.dta', data, convert_strl=['strls'])
+    ...     "./data_file_with_long_strings.dta",
+    ...     data,
+    ...     convert_strl=["strls"],
+    ... )
     >>> writer.write_file()
     """
 
@@ -3656,21 +3705,33 @@ class StataWriterUTF8(StataWriter117):
 
     Using Unicode data and column names
 
     >>> from pandas.io.stata import StataWriterUTF8
-    >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
-    >>> writer = StataWriterUTF8('./data_file.dta', data)
+    >>> data = pd.DataFrame(
+    ...     [[1.0, 1, "ᴬ"]], columns=["a", "β", "ĉ"]
+    ... )
+    >>> writer = StataWriterUTF8("./data_file.dta", data)
     >>> writer.write_file()
 
     Directly write a zip file
-    >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
-    >>> writer = StataWriterUTF8('./data_file.zip', data, compression=compression)
+    >>> compression = {
+    ...     "method": "zip",
+    ...     "archive_name": "data_file.dta",
+    ... }
+    >>> writer = StataWriterUTF8(
+    ...     "./data_file.zip", data, compression=compression
+    ... )
     >>> writer.write_file()
 
     Or with long strings stored in strl format
-    >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
-    ...                     columns=['strls'])
-    >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,
-    ...                          convert_strl=['strls'])
+    >>> data = pd.DataFrame(
+    ...     [["ᴀ relatively long ŝtring"], [""], [""]],
+    ...     columns=["strls"],
+    ... )
+    >>> writer = StataWriterUTF8(
+    ...     "./data_file_with_long_strings.dta",
+    ...     data,
+    ...     convert_strl=["strls"],
+    ... )
     >>> writer.write_file()
     """
 
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index ac497cd266027..d60ea79ffebd4 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -989,7 +989,13 @@ def read_xml(
     efficient method should be used for very large XML files (500MB, 1GB, or
     5GB+). For example, ::
 
-        iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}
+        iterparse = {{
+            "row_element": [
+                "child_elem",
+                "attr",
+                "grandchild_elem",
+            ]
+        }}
 
     .. versionadded:: 1.5.0
 
@@ -1118,9 +1126,11 @@ def read_xml(
     ...   </doc:row>
     ... </doc:data>'''
 
-    >>> df = pd.read_xml(StringIO(xml),
-    ...                  xpath="//doc:row",
-    ...                  namespaces={{"doc": "https://example.com"}})
+    >>> df = pd.read_xml(
+    ...     StringIO(xml),
+    ...     xpath="//doc:row",
+    ...     namespaces={{"doc": "https://example.com"}},
+    ... )
     >>> df
       shape  degrees  sides
     0  square      360    4.0
@@ -1147,9 +1157,11 @@ def read_xml(
     ...         </data>
     ...         '''
 
-    >>> df = pd.read_xml(StringIO(xml_data),
-    ...                  dtype_backend="numpy_nullable",
-    ...                  parse_dates=["e"])
+    >>> df = pd.read_xml(
+    ...     StringIO(xml_data),
+    ...     dtype_backend="numpy_nullable",
+    ...     parse_dates=["e"],
+    ... )
     >>> df
        index  a    b      c  d          e
     0      0  1  2.5   True  a 2019-12-31
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index 1355bda9025b9..a14812edd5124 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -112,7 +112,7 @@ def hist_series(
     .. plot::
         :context: close-figs
 
-        >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
+        >>> lst = ["a", "a", "a", "b", "b", "b"]
         >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst)
         >>> hist = ser.hist()
 
@@ -121,7 +121,7 @@ def hist_series(
     .. plot::
         :context: close-figs
 
-        >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
+        >>> lst = ["a", "a", "a", "b", "b", "b"]
         >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst)
         >>> hist = ser.groupby(level=0).hist()
 
@@ -241,9 +241,11 @@ def hist_frame(
     .. plot::
         :context: close-figs
 
-        >>> data = {'length': [1.5, 0.5, 1.2, 0.9, 3],
-        ...         'width': [0.7, 0.2, 0.15, 0.2, 1.1]}
-        >>> index = ['pig', 'rabbit', 'duck', 'chicken', 'horse']
+        >>> data = {
+        ...     "length": [1.5, 0.5, 1.2, 0.9, 3],
+        ...     "width": [0.7, 0.2, 0.15, 0.2, 1.1],
+        ... 
} + >>> index = ["pig", "rabbit", "duck", "chicken", "horse"] >>> df = pd.DataFrame(data, index=index) >>> hist = df.hist(bins=3) """ @@ -605,19 +607,27 @@ def boxplot_frame_groupby( :context: close-figs >>> import itertools - >>> tuples = [t for t in itertools.product(range(1000), range(4))] - >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) + >>> tuples = [ + ... t for t in itertools.product(range(1000), range(4)) + ... ] + >>> index = pd.MultiIndex.from_tuples( + ... tuples, names=["lvl0", "lvl1"] + ... ) >>> data = np.random.randn(len(index), 4) - >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) - >>> grouped = df.groupby(level='lvl1') - >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10)) # doctest: +SKIP + >>> df = pd.DataFrame(data, columns=list("ABCD"), index=index) + >>> grouped = df.groupby(level="lvl1") + >>> grouped.boxplot( + ... rot=45, fontsize=12, figsize=(8, 10) + ... ) # doctest: +SKIP The ``subplots=False`` option shows the boxplots in a single figure. .. plot:: :context: close-figs - >>> grouped.boxplot(subplots=False, rot=45, fontsize=12) # doctest: +SKIP + >>> grouped.boxplot( + ... subplots=False, rot=45, fontsize=12 + ... ) # doctest: +SKIP """ plot_backend = _get_plot_backend(backend) return plot_backend.boxplot_frame_groupby( @@ -802,16 +812,20 @@ class PlotAccessor(PandasObject): :context: close-figs >>> ser = pd.Series([1, 2, 3, 3]) - >>> plot = ser.plot(kind='hist', title="My plot") + >>> plot = ser.plot(kind="hist", title="My plot") For DataFrame: .. plot:: :context: close-figs - >>> df = pd.DataFrame({'length': [1.5, 0.5, 1.2, 0.9, 3], - ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]}, - ... index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> df = pd.DataFrame( + ... { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... }, + ... index=["pig", "rabbit", "duck", "chicken", "horse"], + ... ) >>> plot = df.plot(title="DataFrame Plot") For SeriesGroupBy: @@ -821,16 +835,21 @@ class PlotAccessor(PandasObject): >>> lst = [-1, -2, -3, 1, 2, 3] >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) - >>> plot = ser.groupby(lambda x: x > 0).plot(title="SeriesGroupBy Plot") + >>> plot = ser.groupby(lambda x: x > 0).plot( + ... title="SeriesGroupBy Plot" + ... ) For DataFrameGroupBy: .. plot:: :context: close-figs - >>> df = pd.DataFrame({"col1" : [1, 2, 3, 4], - ... "col2" : ["A", "B", "A", "B"]}) - >>> plot = df.groupby("col2").plot(kind="bar", title="DataFrameGroupBy Plot") + >>> df = pd.DataFrame( + ... {"col1": [1, 2, 3, 4], "col2": ["A", "B", "A", "B"]} + ... ) + >>> plot = df.groupby("col2").plot( + ... kind="bar", title="DataFrameGroupBy Plot" + ... ) """ _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box") @@ -1329,7 +1348,7 @@ def box(self, by: IndexLabel | None = None, **kwargs) -> PlotAccessor: :context: close-figs >>> data = np.random.randn(25, 4) - >>> df = pd.DataFrame(data, columns=list('ABCD')) + >>> df = pd.DataFrame(data, columns=list("ABCD")) >>> ax = df.plot.box() You can also generate groupings if you specify the `by` parameter (which @@ -1340,8 +1359,25 @@ def box(self, by: IndexLabel | None = None, **kwargs) -> PlotAccessor: .. plot:: :context: close-figs - >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85] - >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list}) + >>> age_list = [ + ... 8, + ... 10, + ... 12, + ... 14, + ... 72, + ... 74, + ... 76, + ... 78, + ... 20, + ... 25, + ... 30, + ... 35, + ... 
60, + ... 85, + ... ] + >>> df = pd.DataFrame( + ... {"gender": list("MMMMMMMMFFFFFF"), "age": age_list} + ... ) >>> ax = df.plot.box(column="age", by="gender", figsize=(10, 8)) """ return self(kind="box", by=by, **kwargs) @@ -1392,8 +1428,10 @@ def hist( .. plot:: :context: close-figs - >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) - >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) + >>> df = pd.DataFrame( + ... np.random.randint(1, 7, 6000), columns=["one"] + ... ) + >>> df["two"] = df["one"] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) A grouped histogram can be generated by providing the parameter `by` (which @@ -1402,9 +1440,28 @@ def hist( .. plot:: :context: close-figs - >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85] - >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list}) - >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8)) + >>> age_list = [ + ... 8, + ... 10, + ... 12, + ... 14, + ... 72, + ... 74, + ... 76, + ... 78, + ... 20, + ... 25, + ... 30, + ... 35, + ... 60, + ... 85, + ... ] + >>> df = pd.DataFrame( + ... {"gender": list("MMMMMMMMFFFFFF"), "age": age_list} + ... ) + >>> ax = df.plot.hist( + ... column=["age"], by="gender", figsize=(10, 8) + ... ) """ return self(kind="hist", by=by, bins=bins, **kwargs) @@ -1491,10 +1548,12 @@ def kde( .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'x': [1, 2, 2.5, 3, 3.5, 4, 5], - ... 'y': [4, 4, 4.5, 5, 5.5, 6, 6], - ... }) + >>> df = pd.DataFrame( + ... { + ... "x": [1, 2, 2.5, 3, 3.5, 4, 5], + ... "y": [4, 4, 4.5, 5, 5.5, 6, 6], + ... } + ... ) >>> ax = df.plot.kde() A scalar bandwidth can be specified. Using a small bandwidth value can @@ -1565,12 +1624,16 @@ def area( .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'sales': [3, 2, 3, 9, 10, 6], - ... 'signups': [5, 5, 6, 12, 14, 13], - ... 'visits': [20, 42, 28, 62, 81, 50], - ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01', - ... freq='ME')) + >>> df = pd.DataFrame( + ... { + ... "sales": [3, 2, 3, 9, 10, 6], + ... "signups": [5, 5, 6, 12, 14, 13], + ... "visits": [20, 42, 28, 62, 81, 50], + ... }, + ... index=pd.date_range( + ... start="2018/01/01", end="2018/07/01", freq="ME" + ... ), + ... ) >>> ax = df.plot.area() Area plots are stacked by default. To produce an unstacked plot, @@ -1586,19 +1649,21 @@ def area( .. plot:: :context: close-figs - >>> ax = df.plot.area(y='sales') + >>> ax = df.plot.area(y="sales") Draw with a different `x`: .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'sales': [3, 2, 3], - ... 'visits': [20, 42, 28], - ... 'day': [1, 2, 3], - ... }) - >>> ax = df.plot.area(x='day') + >>> df = pd.DataFrame( + ... { + ... "sales": [3, 2, 3], + ... "visits": [20, 42, 28], + ... "day": [1, 2, 3], + ... } + ... ) + >>> ax = df.plot.area(x="day") """ return self(kind="area", x=x, y=y, stacked=stacked, **kwargs) @@ -1639,10 +1704,14 @@ def pie(self, **kwargs) -> PlotAccessor: .. plot:: :context: close-figs - >>> df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97], - ... 'radius': [2439.7, 6051.8, 6378.1]}, - ... index=['Mercury', 'Venus', 'Earth']) - >>> plot = df.plot.pie(y='mass', figsize=(5, 5)) + >>> df = pd.DataFrame( + ... { + ... "mass": [0.330, 4.87, 5.97], + ... "radius": [2439.7, 6051.8, 6378.1], + ... }, + ... index=["Mercury", "Venus", "Earth"], + ... ) + >>> plot = df.plot.pie(y="mass", figsize=(5, 5)) .. plot:: :context: close-figs @@ -1728,22 +1797,26 @@ def scatter( .. 
plot:: :context: close-figs - >>> df = pd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], - ... [6.4, 3.2, 1], [5.9, 3.0, 2]], - ... columns=['length', 'width', 'species']) - >>> ax1 = df.plot.scatter(x='length', - ... y='width', - ... c='DarkBlue') + >>> df = pd.DataFrame( + ... [ + ... [5.1, 3.5, 0], + ... [4.9, 3.0, 0], + ... [7.0, 3.2, 1], + ... [6.4, 3.2, 1], + ... [5.9, 3.0, 2], + ... ], + ... columns=["length", "width", "species"], + ... ) + >>> ax1 = df.plot.scatter(x="length", y="width", c="DarkBlue") And now with the color determined by a column as well. .. plot:: :context: close-figs - >>> ax2 = df.plot.scatter(x='length', - ... y='width', - ... c='species', - ... colormap='viridis') + >>> ax2 = df.plot.scatter( + ... x="length", y="width", c="species", colormap="viridis" + ... ) """ return self(kind="scatter", x=x, y=y, s=s, c=c, **kwargs) @@ -1812,9 +1885,10 @@ def hexbin( :context: close-figs >>> n = 10000 - >>> df = pd.DataFrame({'x': np.random.randn(n), - ... 'y': np.random.randn(n)}) - >>> ax = df.plot.hexbin(x='x', y='y', gridsize=20) + >>> df = pd.DataFrame( + ... {"x": np.random.randn(n), "y": np.random.randn(n)} + ... ) + >>> ax = df.plot.hexbin(x="x", y="y", gridsize=20) The next example uses `C` and `np.sum` as `reduce_C_function`. Note that `'observations'` values ranges from 1 to 5 but the result @@ -1825,17 +1899,21 @@ def hexbin( :context: close-figs >>> n = 500 - >>> df = pd.DataFrame({ - ... 'coord_x': np.random.uniform(-3, 3, size=n), - ... 'coord_y': np.random.uniform(30, 50, size=n), - ... 'observations': np.random.randint(1,5, size=n) - ... }) - >>> ax = df.plot.hexbin(x='coord_x', - ... y='coord_y', - ... C='observations', - ... reduce_C_function=np.sum, - ... gridsize=10, - ... cmap="viridis") + >>> df = pd.DataFrame( + ... { + ... "coord_x": np.random.uniform(-3, 3, size=n), + ... "coord_y": np.random.uniform(30, 50, size=n), + ... "observations": np.random.randint(1, 5, size=n), + ... } + ... ) + >>> ax = df.plot.hexbin( + ... x="coord_x", + ... y="coord_y", + ... C="observations", + ... reduce_C_function=np.sum, + ... gridsize=10, + ... cmap="viridis", + ... 
) """ if reduce_C_function is not None: kwargs["reduce_C_function"] = reduce_C_function diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 2a1503b84a634..a06a7bba247a4 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -465,7 +465,7 @@ def _validate_color_args(self, color, colormap): @final @staticmethod def _iter_data( - data: DataFrame | dict[Hashable, Series | DataFrame] + data: DataFrame | dict[Hashable, Series | DataFrame], ) -> Iterator[tuple[Hashable, np.ndarray]]: for col, values in data.items(): # This was originally written to use values.values before EAs @@ -616,8 +616,7 @@ def result(self): # error: Argument 1 to "len" has incompatible type "Union[bool, # Tuple[Any, ...], List[Any], ndarray[Any, Any]]"; expected "Sized" all_sec = ( - is_list_like(self.secondary_y) - and len(self.secondary_y) == self.nseries # type: ignore[arg-type] + is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries # type: ignore[arg-type] ) if sec_true or all_sec: # if all data is plotted on secondary, return right axes diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index cbb66065a8039..544591d09345f 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -50,10 +50,18 @@ def create_iter_data_given_by( If `by` is assigned: >>> import numpy as np - >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] + >>> tuples = [ + ... ("h1", "a"), + ... ("h1", "b"), + ... ("h2", "a"), + ... ("h2", "b"), + ... ] >>> mi = pd.MultiIndex.from_tuples(tuples) - >>> value = [[1, 3, np.nan, np.nan], - ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] + >>> value = [ + ... [1, 3, np.nan, np.nan], + ... [3, 4, np.nan, np.nan], + ... [np.nan, np.nan, 5, 6], + ... ] >>> data = pd.DataFrame(value, columns=mi) >>> create_iter_data_given_by(data) {'h1': h1 @@ -106,9 +114,13 @@ def reconstruct_data_with_by( Examples -------- - >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} + >>> d = { + ... "h": ["h1", "h1", "h2"], + ... "a": [1, 3, 5], + ... "b": [3, 4, 6], + ... 
    >>> df = pd.DataFrame(d)
-    >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b'])
+    >>> reconstruct_data_with_by(df, by="h", cols=["a", "b"])
         h1        h2
          a    b    a    b
    0   1.0  3.0  NaN  NaN
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
index e610f1adb602c..d936f65ca6478 100644
--- a/pandas/plotting/_matplotlib/hist.py
+++ b/pandas/plotting/_matplotlib/hist.py
@@ -202,17 +202,13 @@ def _post_plot_logic(self, ax: Axes, data) -> None:
             # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible
             # type "Hashable"; expected "str"
             ax.set_xlabel(
-                "Frequency"
-                if self.xlabel is None
-                else self.xlabel  # type: ignore[arg-type]
+                "Frequency" if self.xlabel is None else self.xlabel  # type: ignore[arg-type]
             )
             ax.set_ylabel(self.ylabel)  # type: ignore[arg-type]
         else:
             ax.set_xlabel(self.xlabel)  # type: ignore[arg-type]
             ax.set_ylabel(
-                "Frequency"
-                if self.ylabel is None
-                else self.ylabel  # type: ignore[arg-type]
+                "Frequency" if self.ylabel is None else self.ylabel  # type: ignore[arg-type]
             )

     @property
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
index bf1c0f6346f02..067bcf0b01ccb 100644
--- a/pandas/plotting/_matplotlib/timeseries.py
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -250,9 +250,7 @@ def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool:
     if isinstance(data.index, ABCDatetimeIndex):
         # error: "BaseOffset" has no attribute "_period_dtype_code"
         freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str)
-        base = to_offset(
-            freq_str, is_period=True
-        )._period_dtype_code  # type: ignore[attr-defined]
+        base = to_offset(freq_str, is_period=True)._period_dtype_code  # type: ignore[attr-defined]
         x = data.index
         if base <= FreqGroup.FR_DAY.value:
             return x[:1].is_normalized
diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py
index 898b5b25e7b01..f2ec45a7c47e3 100644
--- a/pandas/plotting/_matplotlib/tools.py
+++ b/pandas/plotting/_matplotlib/tools.py
@@ -101,13 +101,15 @@ def _get_layout(
         nrows, ncols = layout

         if nrows == -1 and ncols > 0:
-            layout = nrows, ncols = (ceil(nplots / ncols), ncols)
+            layout = (ceil(nplots / ncols), ncols)
         elif ncols == -1 and nrows > 0:
-            layout = nrows, ncols = (nrows, ceil(nplots / nrows))
+            layout = (nrows, ceil(nplots / nrows))
         elif ncols <= 0 and nrows <= 0:
             msg = "At least one dimension of layout must be positive"
             raise ValueError(msg)

+        nrows, ncols = layout
+
         if nrows * ncols < nplots:
             raise ValueError(
                 f"Layout of {nrows}x{ncols} must be larger than required size {nplots}"
diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
index 18db460d388a4..108264c892cbf 100644
--- a/pandas/plotting/_misc.py
+++ b/pandas/plotting/_misc.py
@@ -51,12 +51,17 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table:
         :context: close-figs

         >>> import matplotlib.pyplot as plt
-        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
         >>> fix, ax = plt.subplots()
-        >>> ax.axis('off')
+        >>> ax.axis("off")
         (0.0, 1.0, 0.0, 1.0)
-        >>> table = pd.plotting.table(ax, df, loc='center',
-        ...                           cellLoc='center', colWidths=list([.2, .2]))
+        >>> table = pd.plotting.table(
+        ...     ax,
+        ...     df,
+        ...     loc="center",
+        ...     cellLoc="center",
+        ...     colWidths=list([0.2, 0.2]),
+        ... )
     """
     plot_backend = _get_plot_backend("matplotlib")
     return plot_backend.table(
@@ -92,16 +97,20 @@ def register() -> None:

         >>> pd.plotting.register_matplotlib_converters()

-        >>> df = pd.DataFrame({'ts': pd.period_range('2020', periods=2, freq='M'),
-        ...                    'y': [1, 2]
-        ...                    })
-        >>> plot = df.plot.line(x='ts', y='y')
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "ts": pd.period_range("2020", periods=2, freq="M"),
+        ...         "y": [1, 2],
+        ...     }
+        ... )
+        >>> plot = df.plot.line(x="ts", y="y")

         Unsetting the register manually an error will be raised:

-        >>> pd.set_option("plotting.matplotlib.register_converters",
-        ...               False)  # doctest: +SKIP
-        >>> df.plot.line(x='ts', y='y')  # doctest: +SKIP
+        >>> pd.set_option(
+        ...     "plotting.matplotlib.register_converters", False
+        ... )  # doctest: +SKIP
+        >>> df.plot.line(x="ts", y="y")  # doctest: +SKIP
         Traceback (most recent call last):
         TypeError: float() argument must be a string or a real number, not 'Period'
     """
@@ -135,16 +144,20 @@ def deregister() -> None:

         >>> pd.plotting.register_matplotlib_converters()

-        >>> df = pd.DataFrame({'ts': pd.period_range('2020', periods=2, freq='M'),
-        ...                    'y': [1, 2]
-        ...                    })
-        >>> plot = df.plot.line(x='ts', y='y')
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "ts": pd.period_range("2020", periods=2, freq="M"),
+        ...         "y": [1, 2],
+        ...     }
+        ... )
+        >>> plot = df.plot.line(x="ts", y="y")

         Unsetting the register manually an error will be raised:

-        >>> pd.set_option("plotting.matplotlib.register_converters",
-        ...               False)  # doctest: +SKIP
-        >>> df.plot.line(x='ts', y='y')  # doctest: +SKIP
+        >>> pd.set_option(
+        ...     "plotting.matplotlib.register_converters", False
+        ... )  # doctest: +SKIP
+        >>> df.plot.line(x="ts", y="y")  # doctest: +SKIP
         Traceback (most recent call last):
         TypeError: float() argument must be a string or a real number, not 'Period'
     """
@@ -204,7 +217,9 @@ def scatter_matrix(
     .. plot::
         :context: close-figs

-        >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
+        >>> df = pd.DataFrame(
+        ...     np.random.randn(1000, 4), columns=["A", "B", "C", "D"]
+        ... )
         >>> pd.plotting.scatter_matrix(df, alpha=0.2)
         array([[<Axes: ...>, <Axes: ...>, <Axes: ...>, <Axes: ...>],
@@ -288,25 +303,69 @@ def radviz(

         >>> df = pd.DataFrame(
         ...     {
-        ...         'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, 6.7, 4.6],
-        ...         'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, 3.3, 3.6],
-        ...         'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, 5.7, 1.0],
-        ...         'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, 2.1, 0.2],
-        ...         'Category': [
-        ...             'virginica',
-        ...             'virginica',
-        ...             'setosa',
-        ...             'virginica',
-        ...             'virginica',
-        ...             'versicolor',
-        ...             'versicolor',
-        ...             'setosa',
-        ...             'virginica',
-        ...             'setosa'
-        ...         ]
+        ...         "SepalLength": [
+        ...             6.5,
+        ...             7.7,
+        ...             5.1,
+        ...             5.8,
+        ...             7.6,
+        ...             5.0,
+        ...             5.4,
+        ...             4.6,
+        ...             6.7,
+        ...             4.6,
+        ...         ],
+        ...         "SepalWidth": [
+        ...             3.0,
+        ...             3.8,
+        ...             3.8,
+        ...             2.7,
+        ...             3.0,
+        ...             2.3,
+        ...             3.0,
+        ...             3.2,
+        ...             3.3,
+        ...             3.6,
+        ...         ],
+        ...         "PetalLength": [
+        ...             5.5,
+        ...             6.7,
+        ...             1.9,
+        ...             5.1,
+        ...             6.6,
+        ...             3.3,
+        ...             4.5,
+        ...             1.4,
+        ...             5.7,
+        ...             1.0,
+        ...         ],
+        ...         "PetalWidth": [
+        ...             1.8,
+        ...             2.2,
+        ...             0.4,
+        ...             1.9,
+        ...             2.1,
+        ...             1.0,
+        ...             1.5,
+        ...             0.2,
+        ...             2.1,
+        ...             0.2,
+        ...         ],
+        ...         "Category": [
+        ...             "virginica",
+        ...             "virginica",
+        ...             "setosa",
+        ...             "virginica",
+        ...             "virginica",
+        ...             "versicolor",
+        ...             "versicolor",
+        ...             "setosa",
+        ...             "virginica",
+        ...             "setosa",
+        ...         ],
         ...     }
         ... )
-        >>> pd.plotting.radviz(df, 'Category')  # doctest: +SKIP
+        >>> pd.plotting.radviz(df, "Category")  # doctest: +SKIP
     """
     plot_backend = _get_plot_backend("matplotlib")
     return plot_backend.radviz(
@@ -371,10 +430,10 @@ def andrews_curves(
         :context: close-figs

         >>> df = pd.read_csv(
-        ...     'https://raw.githubusercontent.com/pandas-dev/'
-        ...     'pandas/main/pandas/tests/io/data/csv/iris.csv'
+        ...     "https://raw.githubusercontent.com/pandas-dev/"
+        ...     "pandas/main/pandas/tests/io/data/csv/iris.csv"
         ... )
-        >>> pd.plotting.andrews_curves(df, 'Name')  # doctest: +SKIP
+        >>> pd.plotting.andrews_curves(df, "Name")  # doctest: +SKIP
     """
     plot_backend = _get_plot_backend("matplotlib")
     return plot_backend.andrews_curves(
@@ -502,11 +561,11 @@ def parallel_coordinates(
         :context: close-figs

         >>> df = pd.read_csv(
-        ...     'https://raw.githubusercontent.com/pandas-dev/'
-        ...     'pandas/main/pandas/tests/io/data/csv/iris.csv'
+        ...     "https://raw.githubusercontent.com/pandas-dev/"
+        ...     "pandas/main/pandas/tests/io/data/csv/iris.csv"
         ... )
         >>> pd.plotting.parallel_coordinates(
-        ...     df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')
+        ...     df, "Name", color=("#556270", "#4ECDC4", "#C7F464")
         ... )  # doctest: +SKIP
     """
     plot_backend = _get_plot_backend("matplotlib")
@@ -598,7 +657,9 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax
         :context: close-figs

         >>> spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000)
-        >>> s = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing))
+        >>> s = pd.Series(
+        ...     0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing)
+        ... )
         >>> pd.plotting.autocorrelation_plot(s)  # doctest: +SKIP
     """
     plot_backend = _get_plot_backend("matplotlib")
@@ -620,10 +681,10 @@ class _Options(dict):
         :context: close-figs

         >>> np.random.seed(42)
-        >>> df = pd.DataFrame({'A': np.random.randn(10),
-        ...                    'B': np.random.randn(10)},
-        ...                   index=pd.date_range("1/1/2000",
-        ...                   freq='4MS', periods=10))
+        >>> df = pd.DataFrame(
+        ...     {"A": np.random.randn(10), "B": np.random.randn(10)},
+        ...     index=pd.date_range("1/1/2000", freq="4MS", periods=10),
+        ... )
         >>> with pd.plotting.plot_params.use("x_compat", True):
         ...     _ = df["A"].plot(color="r")
         ...     _ = df["B"].plot(color="g")
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index ff2cfc1278331..771b5591792c2 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -654,9 +654,7 @@ def test_convert_numeric_uint64_nan_values(
         arr = np.array([2**63, 2**63 + 1], dtype=object)
         na_values = {2**63}

-        expected = (
-            np.array([np.nan, 2**63 + 1], dtype=float) if coerce else arr.copy()
-        )
+        expected = np.array([np.nan, 2**63 + 1], dtype=float) if coerce else arr.copy()
         result = lib.maybe_convert_numeric(
             arr,
             na_values,
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index e1f8d8eca2537..7105755df6f88 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -843,7 +843,8 @@ def test_empty_like(self):
 class TestLibMissing:
     @pytest.mark.parametrize("func", [libmissing.checknull, isna])
     @pytest.mark.parametrize(
-        "value", na_vals + sometimes_na_vals  # type: ignore[operator]
+        "value",
+        na_vals + sometimes_na_vals,  # type: ignore[operator]
     )
     def test_checknull_na_vals(self, func, value):
         assert func(value)
@@ -864,7 +865,8 @@ def test_checknull_never_na_vals(self, func, value):
         assert not func(value)

     @pytest.mark.parametrize(
-        "value", na_vals + sometimes_na_vals  # type: ignore[operator]
+        "value",
+        na_vals + sometimes_na_vals,  # type: ignore[operator]
     )
     def test_checknull_old_na_vals(self, value):
         assert libmissing.checknull(value, inf_as_na=True)
diff --git a/pandas/tests/frame/methods/test_first_and_last.py b/pandas/tests/frame/methods/test_first_and_last.py
index 212e56442ee07..2170cf254fbe6 100644
--- a/pandas/tests/frame/methods/test_first_and_last.py
+++ b/pandas/tests/frame/methods/test_first_and_last.py
@@ -61,17 +61,13 @@ def test_first_last_raises(self, frame_or_series):
         msg = "'first' only supports a DatetimeIndex index"
         with tm.assert_produces_warning(
             FutureWarning, match=deprecated_msg
-        ), pytest.raises(
-            TypeError, match=msg
-        ):  # index is not a DatetimeIndex
+        ), pytest.raises(TypeError, match=msg):  # index is not a DatetimeIndex
             obj.first("1D")

         msg = "'last' only supports a DatetimeIndex index"
         with tm.assert_produces_warning(
             FutureWarning, match=last_deprecated_msg
-        ), pytest.raises(
-            TypeError, match=msg
-        ):  # index is not a DatetimeIndex
+        ), pytest.raises(TypeError, match=msg):  # index is not a DatetimeIndex
             obj.last("1D")

     def test_last_subset(self, frame_or_series):
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
index 8d7a0b373f5f8..4125c13701046 100644
--- a/pandas/tests/frame/methods/test_rank.py
+++ b/pandas/tests/frame/methods/test_rank.py
@@ -324,9 +324,7 @@ def test_rank_pct_true(self, method, exp):
     @pytest.mark.single_cpu
     def test_pct_max_many_rows(self):
         # GH 18271
-        df = DataFrame(
-            {"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)}
-        )
+        df = DataFrame({"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)})
         result = df.rank(pct=True).max()
         assert (result == 1).all()

diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py
index fbf36dbc4fb02..9d07b8ab2288f 100644
--- a/pandas/tests/frame/methods/test_reset_index.py
+++ b/pandas/tests/frame/methods/test_reset_index.py
@@ -232,9 +232,7 @@ def test_reset_index_level_missing(self, idx_lev):
     def test_reset_index_right_dtype(self):
         time = np.arange(0.0, 10, np.sqrt(2) / 2)
-        s1 = Series(
-            (9.81 * time**2) / 2, index=Index(time, name="time"), name="speed"
-        )
+        s1 = Series((9.81 * time**2) / 2, index=Index(time, name="time"), name="speed")
         df = DataFrame(s1)

         reset = s1.reset_index()
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index c3bcd30796e63..b6bb73f43c1ac 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1188,9 +1188,7 @@ def test_agg_with_one_lambda(self):

         # check pd.NameAgg case
         result1 = df.groupby(by="kind").agg(
-            height_sqr_min=pd.NamedAgg(
-                column="height", aggfunc=lambda x: np.min(x**2)
-            ),
+            height_sqr_min=pd.NamedAgg(column="height", aggfunc=lambda x: np.min(x**2)),
             height_max=pd.NamedAgg(column="height", aggfunc="max"),
             weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
         )
@@ -1245,9 +1243,7 @@ def test_agg_multiple_lambda(self):
         # check pd.NamedAgg case
         result2 = df.groupby(by="kind").agg(
-            height_sqr_min=pd.NamedAgg(
-                column="height", aggfunc=lambda x: np.min(x**2)
-            ),
+            height_sqr_min=pd.NamedAgg(column="height", aggfunc=lambda x: np.min(x**2)),
             height_max=pd.NamedAgg(column="height", aggfunc="max"),
             weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
             height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)),
diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py
index e93fc0e2a4e2e..f766894a993a0 100644
--- a/pandas/tests/indexes/datetimes/test_scalar_compat.py
+++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py
@@ -135,7 +135,8 @@ def test_dti_hour_tzaware(self, prefix):
     # GH#12806
     # error: Unsupported operand types for + ("List[None]" and "List[str]")
     @pytest.mark.parametrize(
-        "time_locale", [None] + tm.get_locales()  # type: ignore[operator]
+        "time_locale",
+        [None] + tm.get_locales(),  # type: ignore[operator]
     )
     def test_day_name_month_name(self, time_locale):
         # Test Monday -> Sunday and January -> December, in that sequence
diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py
index edd0feaaa1159..8411ba6235b40 100644
--- a/pandas/tests/indexes/multi/test_join.py
+++ b/pandas/tests/indexes/multi/test_join.py
@@ -258,7 +258,7 @@ def test_join_dtypes_all_nan(any_numeric_ea_dtype):

 def test_join_index_levels():
     # GH#53093
-    midx = midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")])
+    midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")])
     midx2 = MultiIndex.from_tuples([("a", "2019-01-31")])
     result = midx.join(midx2, how="outer")
     expected = MultiIndex.from_tuples(
diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py
index b4dcef71dcf50..4a1a6b9c452d5 100644
--- a/pandas/tests/indexes/multi/test_sorting.py
+++ b/pandas/tests/indexes/multi/test_sorting.py
@@ -151,7 +151,7 @@ def test_unsortedindex_doc_examples():

     msg = r"Key length \(2\) was greater than MultiIndex lexsort depth \(1\)"
     with pytest.raises(UnsortedIndexError, match=msg):
-        dfm.loc[(0, "y"):(1, "z")]
+        dfm.loc[(0, "y") : (1, "z")]

     assert not dfm.index._is_lexsorted()
     assert dfm.index._lexsort_depth == 1
@@ -159,7 +159,7 @@ def test_unsortedindex_doc_examples():
     # sort it
     dfm = dfm.sort_index()
     dfm.loc[(1, "z")]
-    dfm.loc[(0, "y"):(1, "z")]
+    dfm.loc[(0, "y") : (1, "z")]

     assert dfm.index._is_lexsorted()
     assert dfm.index._lexsort_depth == 2
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index cd28d519313ed..762009f54ccef 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -406,8 +406,9 @@ def test_get_indexer_arrow_dictionary_target(self):
         tm.assert_numpy_array_equal(result, expected)

         result_1, result_2 = idx.get_indexer_non_unique(target)
-        expected_1, expected_2 = np.array([0, -1], dtype=np.int64), np.array(
-            [1], dtype=np.int64
+        expected_1, expected_2 = (
+            np.array([0, -1], dtype=np.int64),
+            np.array([1], dtype=np.int64),
         )
         tm.assert_numpy_array_equal(result_1, expected_1)
         tm.assert_numpy_array_equal(result_2, expected_2)
diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py
index 918d505216735..9839f40861d55 100644
--- a/pandas/tests/indexes/numeric/test_join.py
+++ b/pandas/tests/indexes/numeric/test_join.py
@@ -313,15 +313,11 @@ def test_join_right(self, index_large):
         tm.assert_numpy_array_equal(ridx, eridx)

     def test_join_non_int_index(self, index_large):
-        other = Index(
-            2**63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object
-        )
+        other = Index(2**63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object)

         outer = index_large.join(other, how="outer")
         outer2 = other.join(index_large, how="outer")
-        expected = Index(
-            2**63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64")
-        )
+        expected = Index(2**63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64"))
         tm.assert_index_equal(outer, outer2)
         tm.assert_index_equal(outer, expected)
@@ -353,9 +349,7 @@ def test_join_outer(self, index_large):
         noidx_res = index_large.join(other, how="outer")
         tm.assert_index_equal(res, noidx_res)

-        eres = Index(
-            2**63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64")
-        )
+        eres = Index(2**63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64"))
         elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
         eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp)
diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py
index fdf88b2a97e46..eb6f51856ca54 100644
--- a/pandas/tests/indexing/multiindex/test_partial.py
+++ b/pandas/tests/indexing/multiindex/test_partial.py
@@ -95,7 +95,7 @@ def test_fancy_slice_partial(
         tm.assert_frame_equal(result, expected)

         ymd = multiindex_year_month_day_dataframe_random_data
-        result = ymd.loc[(2000, 2):(2000, 4)]
+        result = ymd.loc[(2000, 2) : (2000, 4)]
         lev = ymd.index.codes[1]
         expected = ymd[(lev >= 1) & (lev <= 3)]
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py
index cef3dca054758..7f298e9bdd375 100644
--- a/pandas/tests/indexing/multiindex/test_slice.py
+++ b/pandas/tests/indexing/multiindex/test_slice.py
@@ -700,21 +700,23 @@ def test_multiindex_label_slicing_with_negative_step(self):
         tm.assert_indexing_slices_equivalent(ser, SLC[::-1], SLC[::-1])

         tm.assert_indexing_slices_equivalent(ser, SLC["d"::-1], SLC[15::-1])
-        tm.assert_indexing_slices_equivalent(ser, SLC[("d",)::-1], SLC[15::-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[("d",) :: -1], SLC[15::-1])

         tm.assert_indexing_slices_equivalent(ser, SLC[:"d":-1], SLC[:11:-1])
-        tm.assert_indexing_slices_equivalent(ser, SLC[:("d",):-1], SLC[:11:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[: ("d",) : -1], SLC[:11:-1])

         tm.assert_indexing_slices_equivalent(ser, SLC["d":"b":-1], SLC[15:3:-1])
-        tm.assert_indexing_slices_equivalent(ser, SLC[("d",):"b":-1], SLC[15:3:-1])
-        tm.assert_indexing_slices_equivalent(ser, SLC["d":("b",):-1], SLC[15:3:-1])
-        tm.assert_indexing_slices_equivalent(ser, SLC[("d",):("b",):-1], SLC[15:3:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[("d",) : "b" : -1], SLC[15:3:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC["d" : ("b",) : -1], SLC[15:3:-1])
+        tm.assert_indexing_slices_equivalent(
+            ser, SLC[("d",) : ("b",) : -1], SLC[15:3:-1]
+        )

         tm.assert_indexing_slices_equivalent(ser, SLC["b":"d":-1], SLC[:0])

-        tm.assert_indexing_slices_equivalent(ser, SLC[("c", 2)::-1], SLC[10::-1])
-        tm.assert_indexing_slices_equivalent(ser, SLC[:("c", 2):-1], SLC[:9:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[("c", 2) :: -1], SLC[10::-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[: ("c", 2) : -1], SLC[:9:-1])
         tm.assert_indexing_slices_equivalent(
-            ser, SLC[("e", 0):("c", 2):-1], SLC[16:9:-1]
+            ser, SLC[("e", 0) : ("c", 2) : -1], SLC[16:9:-1]
         )

     def test_multiindex_slice_first_level(self):
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index ce7dde3c4cb42..c49a85397f738 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1829,7 +1829,7 @@ def test_loc_setitem_multiindex_slice(self):
         )

         result = Series([1, 1, 1, 1, 1, 1, 1, 1], index=index)
-        result.loc[("baz", "one"):("foo", "two")] = 100
+        result.loc[("baz", "one") : ("foo", "two")] = 100

         expected = Series([1, 1, 100, 100, 100, 100, 1, 1], index=index)

@@ -2854,7 +2854,7 @@ def test_loc_axis_1_slice():
         index=tuple("ABCDEFGHIJ"),
         columns=MultiIndex.from_tuples(cols),
     )
-    result = df.loc(axis=1)[(2014, 9):(2015, 8)]
+    result = df.loc(axis=1)[(2014, 9) : (2015, 8)]
     expected = DataFrame(
         np.ones((10, 4)),
         index=tuple("ABCDEFGHIJ"),
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 2265522bc7ecb..7041a5a0a293b 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -201,7 +201,7 @@ def create_mgr(descr, item_shape=None):
     * components with same DTYPE_ID are combined into single block
     * to force multiple blocks with same dtype, use '-SUFFIX'::

-        'a:f8-1; b:f8-2; c:f8-foobar'
+        "a:f8-1; b:f8-2; c:f8-foobar"

     """
     if item_shape is None:
diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py
index 1e345eb82ed3c..8cb06e3b7619d 100644
--- a/pandas/tests/io/formats/style/test_html.py
+++ b/pandas/tests/io/formats/style/test_html.py
@@ -93,11 +93,7 @@ def test_w3_html_format(styler):
         lambda x: "att1:v1;"
     ).set_table_attributes('class="my-cls1" style="attr3:v3;"').set_td_classes(
         DataFrame(["my-cls2"], index=["a"], columns=["A"])
-    ).format(
-        "{:.1f}"
-    ).set_caption(
-        "A comprehensive test"
-    )
+    ).format("{:.1f}").set_caption("A comprehensive test")

     expected = dedent(
         """\