Input/Output (io)

`read_commandline(cmd, engine='pandas', **kwargs)`

Read a CSV file based on a command-line command.

For example, you may wish to run the following command on sep-quarter.csv before reading it into a pandas DataFrame:

cat sep-quarter.csv | grep .SEA1AA

In this case, you can use the following Python code to load the dataframe:

import janitor as jn
df = jn.read_commandline("cat data.csv | grep .SEA1AA")

This function assumes that your command line command will return an output that is parsable using the relevant engine and StringIO. This function defaults to using pd.read_csv underneath the hood. Keyword arguments are passed through as-is.

Parameters:

Name	Type	Description	Default
`cmd`	`str`	Shell command to preprocess a file on disk.	required
`engine`	`str`	DataFrame engine to process the output of the shell command. Currently supports both pandas and polars.	`'pandas'`
`**kwargs`	`Any`	Keyword arguments that are passed through to the engine's csv reader.	`{}`

Returns:

Type	Description
`Mapping`	A DataFrame parsed from the stdout of the underlying shell.

Source code in janitor/io.py

def read_commandline(
    cmd: str, engine: str = "pandas", **kwargs: Any
) -> Mapping:
    """Read a CSV file based on a command-line command.

    For example, you may wish to run the following command on `sep-quarter.csv`
    before reading it into a pandas DataFrame:

    ```bash
    cat sep-quarter.csv | grep .SEA1AA
    ```

    In this case, you can use the following Python code to load the dataframe:

    ```python
    import janitor as jn
    df = jn.read_commandline("cat data.csv | grep .SEA1AA")
    ```

    This function assumes that your command line command will return
    an output that is parsable using the relevant engine and StringIO.
    This function defaults to using `pd.read_csv` underneath the hood.
    Keyword arguments are passed through as-is.

    Args:
        cmd: Shell command to preprocess a file on disk.
        engine: DataFrame engine to process the output of the shell command.
            Currently supports both pandas and polars.
        **kwargs: Keyword arguments that are passed through to
            the engine's csv reader.


    Returns:
        A DataFrame parsed from the stdout of the underlying
            shell.
    """

    check("cmd", cmd, [str])
    if engine not in {"pandas", "polars"}:
        raise ValueError("engine should be either pandas or polars.")
    # adding check=True ensures that an explicit, clear error
    # is raised, so that the user can see the reason for the failure
    outcome = subprocess.run(
        cmd, shell=True, capture_output=True, text=True, check=True
    )
    if engine == "polars":
        try:
            import polars as pl
        except ImportError:
            import_message(
                submodule="polars",
                package="polars",
                conda_channel="conda-forge",
                pip_install=True,
            )
        return pl.read_csv(StringIO(outcome.stdout), **kwargs)
    return pd.read_csv(StringIO(outcome.stdout), **kwargs)

`read_csvs(files_path, separate_df=False, **kwargs)`

Read multiple CSV files and return a dictionary of DataFrames, or one concatenated DataFrame.

Parameters:

Name	Type	Description	Default
`files_path`	`Union[str, Iterable[str]]`	The filepath pattern matching the CSV files. Accepts regular expressions, with or without `.csv` extension. Also accepts iterable of file paths.	required
`separate_df`	`bool`	If `False` (default), returns a single Dataframe with the concatenation of the csv files. If `True`, returns a dictionary of separate DataFrames for each CSV file.	`False`
`**kwargs`	`Any`	Keyword arguments to pass into the original pandas `read_csv`.	`{}`

Raises:

Type	Description
`JanitorError`	If `None` provided for `files_path`.
`JanitorError`	If length of `files_path` is `0`.
`ValueError`	If no CSV files exist in `files_path`.
`ValueError`	If columns in input CSV files do not match.

Returns:

Type	Description
`Union[DataFrame, dict]`	DataFrame of concatenated DataFrames or dictionary of DataFrames.

Source code in janitor/io.py

@deprecated_alias(seperate_df="separate_df", filespath="files_path")
def read_csvs(
    files_path: Union[str, Iterable[str]],
    separate_df: bool = False,
    **kwargs: Any,
) -> Union[pd.DataFrame, dict]:
    """Read multiple CSV files and return a dictionary of DataFrames, or
    one concatenated DataFrame.

    Args:
        files_path: The filepath pattern matching the CSV files.
            Accepts regular expressions, with or without `.csv` extension.
            Also accepts iterable of file paths.
        separate_df: If `False` (default), returns a single Dataframe
            with the concatenation of the csv files.
            If `True`, returns a dictionary of separate DataFrames
            for each CSV file.
        **kwargs: Keyword arguments to pass into the
            original pandas `read_csv`.

    Raises:
        JanitorError: If `None` provided for `files_path`.
        JanitorError: If length of `files_path` is `0`.
        ValueError: If no CSV files exist in `files_path`.
        ValueError: If columns in input CSV files do not match.

    Returns:
        DataFrame of concatenated DataFrames or dictionary of DataFrames.
    """
    # Sanitize input
    if files_path is None:
        raise JanitorError("`None` provided for `files_path`")
    if not files_path:
        raise JanitorError("0 length `files_path` provided")

    # Read the csv files
    # String to file/folder or file pattern provided
    if isinstance(files_path, str):
        dfs_dict = {
            os.path.basename(f): pd.read_csv(f, **kwargs)
            for f in glob(files_path)
        }
    # Iterable of file paths provided
    else:
        dfs_dict = {
            os.path.basename(f): pd.read_csv(f, **kwargs) for f in files_path
        }
    # Check if dataframes have been read
    if not dfs_dict:
        raise ValueError("No CSV files to read with the given `files_path`")
    # Concatenate the dataframes if requested (default)
    col_names = list(dfs_dict.values())[0].columns  # noqa: PD011
    if not separate_df:
        # If columns do not match raise an error
        for df in dfs_dict.values():  # noqa: PD011
            if not all(df.columns == col_names):
                raise ValueError(
                    "Columns in input CSV files do not match."
                    "Files cannot be concatenated."
                )
        return pd.concat(
            list(dfs_dict.values()),
            ignore_index=True,
            sort=False,  # noqa: PD011
            copy=False,
        )
    return dfs_dict

`xlsx_cells(path, sheetnames=None, start_point=None, end_point=None, read_only=True, include_blank_cells=True, fill=False, font=False, alignment=False, border=False, protection=False, comment=False, engine='pandas', **kwargs)`

Imports data from spreadsheet without coercing it into a rectangle.

Each cell is represented by a row in a dataframe, and includes the cell's coordinates, the value, row and column position. The cell formatting (fill, font, border, etc) can also be accessed; usually this is returned as a dictionary in the cell, and the specific cell format attribute can be accessed using pd.Series.str.get or pl.struct.field if it is a polars DataFrame.

Inspiration for this comes from R's tidyxl package.

Examples:

>>> import pandas as pd
>>> import polars as pl
>>> from janitor import xlsx_cells
>>> pd.set_option("display.max_columns", None)
>>> pd.set_option("display.expand_frame_repr", False)
>>> pd.set_option("max_colwidth", None)
>>> filename = "../pyjanitor/tests/test_data/worked-examples.xlsx"

Each cell is returned as a row:

>>> xlsx_cells(filename, sheetnames="highlights")
    value internal_value coordinate  row  column data_type  is_date number_format
0     Age            Age         A1    1       1         s    False       General
1  Height         Height         B1    1       2         s    False       General
2       1              1         A2    2       1         n    False       General
3       2              2         B2    2       2         n    False       General
4       3              3         A3    3       1         n    False       General
5       4              4         B3    3       2         n    False       General
6       5              5         A4    4       1         n    False       General
7       6              6         B4    4       2         n    False       General

Access cell formatting such as fill:

>>> out=xlsx_cells(filename, sheetnames="highlights", fill=True).select("value", "fill", axis='columns')
>>> out
    value                                                                                                                                              fill
0     Age     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
1  Height     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
2       1     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
3       2     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
4       3  {'patternType': 'solid', 'fgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}}
5       4  {'patternType': 'solid', 'fgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}}
6       5     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
7       6     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}

Specific cell attributes can be accessed by using Pandas' series.str.get:

>>> out.fill.str.get("fgColor").str.get("rgb")
0    00000000
1    00000000
2    00000000
3    00000000
4    FFFFFF00
5    FFFFFF00
6    00000000
7    00000000
Name: fill, dtype: object

Access cell formatting in a polars DataFrame:

>>> out = xlsx_cells(filename, sheetnames="highlights", engine='polars', fill=True).get_column('fill')
>>> out
shape: (8,)
Series: 'fill' [struct[3]]
[
   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
   {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
   {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
   {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
]

Specific cell attributes can be acessed via Polars' struct:

>>> out.struct.field('fgColor').struct.field('rgb')
shape: (8,)
Series: 'rgb' [str]
[
   "00000000"
   "00000000"
   "00000000"
   "00000000"
   "FFFFFF00"
   "FFFFFF00"
   "00000000"
   "00000000"
]

Parameters:

Name	Type	Description	Default
`path`	`Union[str, Workbook]`	Path to the Excel File. It can also be an openpyxl Workbook.	required
`sheetnames`	`Union[str, list, tuple]`	Names of the sheets from which the cells are to be extracted. If `None`, all the sheets in the file are extracted; if it is a string, or list or tuple, only the specified sheets are extracted.	`None`
`start_point`	`Union[str, int]`	Start coordinates of the Excel sheet. This is useful if the user is only interested in a subsection of the sheet. If `start_point` is provided, `end_point` must be provided as well.	`None`
`end_point`	`Union[str, int]`	End coordinates of the Excel sheet. This is useful if the user is only interested in a subsection of the sheet. If `end_point` is provided, `start_point` must be provided as well.	`None`
`read_only`	`bool`	Determines if the entire file is loaded in memory, or streamed. For memory efficiency, read_only should be set to `True`. Some cell properties like `comment`, can only be accessed by setting `read_only` to `False`.	`True`
`include_blank_cells`	`bool`	Determines if cells without a value should be included.	`True`
`fill`	`bool`	If `True`, return fill properties of the cell. It is usually returned as a dictionary.	`False`
`font`	`bool`	If `True`, return font properties of the cell. It is usually returned as a dictionary.	`False`
`alignment`	`bool`	If `True`, return alignment properties of the cell. It is usually returned as a dictionary.	`False`
`border`	`bool`	If `True`, return border properties of the cell. It is usually returned as a dictionary.	`False`
`protection`	`bool`	If `True`, return protection properties of the cell. It is usually returned as a dictionary.	`False`
`comment`	`bool`	If `True`, return comment properties of the cell. It is usually returned as a dictionary.	`False`
`engine`	`str`	DataFrame engine. Should be either pandas or polars.	`'pandas'`
`**kwargs`	`Any`	Any other attributes of the cell, that can be accessed from openpyxl.	`{}`

Raises:

Type	Description
`ValueError`	If kwargs is provided, and one of the keys is a default column.
`AttributeError`	If kwargs is provided and any of the keys is not a openpyxl cell attribute.

Returns:

Type	Description
`Mapping`	A DataFrame, or a dictionary of DataFrames.

Source code in janitor/io.py

def xlsx_cells(
    path: Union[str, Workbook],
    sheetnames: Union[str, list, tuple] = None,
    start_point: Union[str, int] = None,
    end_point: Union[str, int] = None,
    read_only: bool = True,
    include_blank_cells: bool = True,
    fill: bool = False,
    font: bool = False,
    alignment: bool = False,
    border: bool = False,
    protection: bool = False,
    comment: bool = False,
    engine: str = "pandas",
    **kwargs: Any,
) -> Mapping:
    """Imports data from spreadsheet without coercing it into a rectangle.

    Each cell is represented by a row in a dataframe, and includes the
    cell's coordinates, the value, row and column position.
    The cell formatting (fill, font, border, etc) can also be accessed;
    usually this is returned as a dictionary in the cell, and the specific
    cell format attribute can be accessed using `pd.Series.str.get`
    or `pl.struct.field` if it is a polars DataFrame.

    Inspiration for this comes from R's [tidyxl][link] package.
    [link]: https://nacnudus.github.io/tidyxl/reference/tidyxl.html

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> from janitor import xlsx_cells
        >>> pd.set_option("display.max_columns", None)
        >>> pd.set_option("display.expand_frame_repr", False)
        >>> pd.set_option("max_colwidth", None)
        >>> filename = "../pyjanitor/tests/test_data/worked-examples.xlsx"

        Each cell is returned as a row:

        >>> xlsx_cells(filename, sheetnames="highlights")
            value internal_value coordinate  row  column data_type  is_date number_format
        0     Age            Age         A1    1       1         s    False       General
        1  Height         Height         B1    1       2         s    False       General
        2       1              1         A2    2       1         n    False       General
        3       2              2         B2    2       2         n    False       General
        4       3              3         A3    3       1         n    False       General
        5       4              4         B3    3       2         n    False       General
        6       5              5         A4    4       1         n    False       General
        7       6              6         B4    4       2         n    False       General

        Access cell formatting such as fill:

        >>> out=xlsx_cells(filename, sheetnames="highlights", fill=True).select("value", "fill", axis='columns')
        >>> out
            value                                                                                                                                              fill
        0     Age     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
        1  Height     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
        2       1     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
        3       2     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
        4       3  {'patternType': 'solid', 'fgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}}
        5       4  {'patternType': 'solid', 'fgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': 'FFFFFF00', 'type': 'rgb', 'tint': 0.0}}
        6       5     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}
        7       6     {'patternType': None, 'fgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}, 'bgColor': {'rgb': '00000000', 'type': 'rgb', 'tint': 0.0}}

        Specific cell attributes can be accessed by using Pandas' `series.str.get`:

        >>> out.fill.str.get("fgColor").str.get("rgb")
        0    00000000
        1    00000000
        2    00000000
        3    00000000
        4    FFFFFF00
        5    FFFFFF00
        6    00000000
        7    00000000
        Name: fill, dtype: object

        Access cell formatting in a polars DataFrame:

        >>> out = xlsx_cells(filename, sheetnames="highlights", engine='polars', fill=True).get_column('fill')
        >>> out
        shape: (8,)
        Series: 'fill' [struct[3]]
        [
           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
           {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
           {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
           {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
        ]

        Specific cell attributes can be acessed via Polars' struct:

        >>> out.struct.field('fgColor').struct.field('rgb')
        shape: (8,)
        Series: 'rgb' [str]
        [
           "00000000"
           "00000000"
           "00000000"
           "00000000"
           "FFFFFF00"
           "FFFFFF00"
           "00000000"
           "00000000"
        ]


    Args:
        path: Path to the Excel File. It can also be an openpyxl Workbook.
        sheetnames: Names of the sheets from which the cells are to be extracted.
            If `None`, all the sheets in the file are extracted;
            if it is a string, or list or tuple, only the specified sheets are extracted.
        start_point: Start coordinates of the Excel sheet. This is useful
            if the user is only interested in a subsection of the sheet.
            If `start_point` is provided, `end_point` must be provided as well.
        end_point: End coordinates of the Excel sheet. This is useful
            if the user is only interested in a subsection of the sheet.
            If `end_point` is provided, `start_point` must be provided as well.
        read_only: Determines if the entire file is loaded in memory,
            or streamed. For memory efficiency, read_only should be set to `True`.
            Some cell properties like `comment`, can only be accessed by
            setting `read_only` to `False`.
        include_blank_cells: Determines if cells without a value should be included.
        fill: If `True`, return fill properties of the cell.
            It is usually returned as a dictionary.
        font: If `True`, return font properties of the cell.
            It is usually returned as a dictionary.
        alignment: If `True`, return alignment properties of the cell.
            It is usually returned as a dictionary.
        border: If `True`, return border properties of the cell.
            It is usually returned as a dictionary.
        protection: If `True`, return protection properties of the cell.
            It is usually returned as a dictionary.
        comment: If `True`, return comment properties of the cell.
            It is usually returned as a dictionary.
        engine: DataFrame engine. Should be either pandas or polars.
        **kwargs: Any other attributes of the cell, that can be accessed from openpyxl.

    Raises:
        ValueError: If kwargs is provided, and one of the keys is a default column.
        AttributeError: If kwargs is provided and any of the keys
            is not a openpyxl cell attribute.

    Returns:
        A DataFrame, or a dictionary of DataFrames.
    """  # noqa : E501

    try:
        from openpyxl import load_workbook
        from openpyxl.cell.cell import Cell
        from openpyxl.cell.read_only import ReadOnlyCell
        from openpyxl.workbook.workbook import Workbook
    except ImportError:
        import_message(
            submodule="io",
            package="openpyxl",
            conda_channel="conda-forge",
            pip_install=True,
        )

    path_is_workbook = isinstance(path, Workbook)
    if not path_is_workbook:
        # for memory efficiency, read_only is set to True
        # if comments is True, read_only has to be False,
        # as lazy loading is not enabled for comments
        if comment and read_only:
            raise ValueError(
                "To access comments, kindly set 'read_only' to False."
            )
        path = load_workbook(
            filename=path, read_only=read_only, keep_links=False
        )
    if engine not in {"pandas", "polars"}:
        raise ValueError("engine should be one of pandas or polars.")
    base_engine = pd
    if engine == "polars":
        try:
            import polars as pl

            base_engine = pl
        except ImportError:
            import_message(
                submodule="polars",
                package="polars",
                conda_channel="conda-forge",
                pip_install=True,
            )
    # start_point and end_point applies if the user is interested in
    # only a subset of the Excel File and knows the coordinates
    if start_point or end_point:
        check("start_point", start_point, [str, int])
        check("end_point", end_point, [str, int])

    defaults = (
        "value",
        "internal_value",
        "coordinate",
        "row",
        "column",
        "data_type",
        "is_date",
        "number_format",
    )

    parameters = {
        "fill": fill,
        "font": font,
        "alignment": alignment,
        "border": border,
        "protection": protection,
        "comment": comment,
    }

    if kwargs:
        if path_is_workbook:
            if path.read_only:
                _cell = ReadOnlyCell
            else:
                _cell = Cell
        else:
            if read_only:
                _cell = ReadOnlyCell
            else:
                _cell = Cell

        attrs = {
            attr
            for attr, _ in inspect.getmembers(_cell, not (inspect.isroutine))
            if not attr.startswith("_")
        }

        for key in kwargs:
            if key in defaults:
                raise ValueError(
                    f"{key} is part of the default attributes "
                    "returned as a column."
                )
            elif key not in attrs:
                raise AttributeError(
                    f"{key} is not a recognized attribute of {_cell}."
                )
        parameters.update(kwargs)

    if not sheetnames:
        sheetnames = path.sheetnames
    elif isinstance(sheetnames, str):
        sheetnames = [sheetnames]
    else:
        check("sheetnames", sheetnames, [str, list, tuple])

    out = {
        sheetname: _xlsx_cells(
            path[sheetname],
            defaults,
            parameters,
            start_point,
            end_point,
            include_blank_cells,
            base_engine=base_engine,
        )
        for sheetname in sheetnames
    }
    if len(out) == 1:
        _, out = out.popitem()

    if (not path_is_workbook) and path.read_only:
        path.close()

    return out

`xlsx_table(path, sheetname=None, table=None, engine='pandas')`

Returns a DataFrame of values in a table in the Excel file.

This applies to an Excel file, where the data range is explicitly specified as a Microsoft Excel table.

If there is a single table in the sheet, or a string is provided as an argument to the table parameter, a DataFrame is returned; if there is more than one table in the sheet, and the table argument is None, or a list/tuple of names, a dictionary of DataFrames is returned, where the keys of the dictionary are the table names.

Examples:

>>> import pandas as pd
>>> import polars as pl
>>> from janitor import xlsx_table
>>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"

Single table:

>>> xlsx_table(filename, table='dCategory')
   CategoryID       Category
0           1       Beginner
1           2       Advanced
2           3      Freestyle
3           4    Competition
4           5  Long Distance

>>> xlsx_table(filename, table='dCategory', engine='polars')
shape: (5, 2)
┌────────────┬───────────────┐
│ CategoryID ┆ Category      │
│ ---        ┆ ---           │
│ i64        ┆ str           │
╞════════════╪═══════════════╡
│ 1          ┆ Beginner      │
│ 2          ┆ Advanced      │
│ 3          ┆ Freestyle     │
│ 4          ┆ Competition   │
│ 5          ┆ Long Distance │
└────────────┴───────────────┘

Multiple tables:

>>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
>>> out["dCategory"]
   CategoryID       Category
0           1       Beginner
1           2       Advanced
2           3      Freestyle
3           4    Competition
4           5  Long Distance
>>> out["dSalesReps"].head(3)
   SalesRepID             SalesRep Region
0           1  Sioux Radcoolinator     NW
1           2        Tyrone Smithe     NE
2           3         Chantel Zoya     SW

Parameters:

Name	Type	Description	Default
`path`	`Union[str, IO, Workbook]`	Path to the Excel File. It can also be an openpyxl Workbook.	required
`table`	`Union[str, list, tuple]`	Name of a table, or list of tables in the sheet.	`None`
`engine`	`str`	DataFrame engine. Should be either pandas or polars. Defaults to pandas	`'pandas'`

Raises:

Type	Description
`AttributeError`	If a workbook is provided, and is a ReadOnlyWorksheet.
`ValueError`	If there are no tables in the sheet.
`KeyError`	If the provided table does not exist in the sheet.

Returns:

Type	Description
`Mapping`	A DataFrame, or a dictionary of DataFrames, if there are multiple arguments for the `table` parameter, or the argument to `table` is `None`.

Source code in janitor/io.py

def xlsx_table(
    path: Union[str, IO, Workbook],
    sheetname: str = None,
    table: Union[str, list, tuple] = None,
    engine: str = "pandas",
) -> Mapping:
    """Returns a DataFrame of values in a table in the Excel file.

    This applies to an Excel file, where the data range is explicitly
    specified as a Microsoft Excel table.

    If there is a single table in the sheet, or a string is provided
    as an argument to the `table` parameter, a DataFrame is returned;
    if there is more than one table in the sheet,
    and the `table` argument is `None`, or a list/tuple of names,
    a dictionary of DataFrames is returned, where the keys of the dictionary
    are the table names.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> from janitor import xlsx_table
        >>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"

        Single table:

        >>> xlsx_table(filename, table='dCategory')
           CategoryID       Category
        0           1       Beginner
        1           2       Advanced
        2           3      Freestyle
        3           4    Competition
        4           5  Long Distance

        >>> xlsx_table(filename, table='dCategory', engine='polars')
        shape: (5, 2)
        ┌────────────┬───────────────┐
        │ CategoryID ┆ Category      │
        │ ---        ┆ ---           │
        │ i64        ┆ str           │
        ╞════════════╪═══════════════╡
        │ 1          ┆ Beginner      │
        │ 2          ┆ Advanced      │
        │ 3          ┆ Freestyle     │
        │ 4          ┆ Competition   │
        │ 5          ┆ Long Distance │
        └────────────┴───────────────┘

        Multiple tables:

        >>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
        >>> out["dCategory"]
           CategoryID       Category
        0           1       Beginner
        1           2       Advanced
        2           3      Freestyle
        3           4    Competition
        4           5  Long Distance
        >>> out["dSalesReps"].head(3)
           SalesRepID             SalesRep Region
        0           1  Sioux Radcoolinator     NW
        1           2        Tyrone Smithe     NE
        2           3         Chantel Zoya     SW

    Args:
        path: Path to the Excel File. It can also be an openpyxl Workbook.
        table: Name of a table, or list of tables in the sheet.
        engine: DataFrame engine. Should be either pandas or polars.
            Defaults to pandas

    Raises:
        AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
        ValueError: If there are no tables in the sheet.
        KeyError: If the provided table does not exist in the sheet.

    Returns:
        A DataFrame, or a dictionary of DataFrames,
            if there are multiple arguments for the `table` parameter,
            or the argument to `table` is `None`.
    """  # noqa : E501

    try:
        from openpyxl import load_workbook
        from openpyxl.workbook.workbook import Workbook
    except ImportError:
        import_message(
            submodule="io",
            package="openpyxl",
            conda_channel="conda-forge",
            pip_install=True,
        )
    # TODO: remove in version 1.0
    if sheetname:
        warnings.warn(
            "The keyword argument "
            "'sheetname' of 'xlsx_tables' is deprecated.",
            DeprecationWarning,
            stacklevel=find_stack_level(),
        )
    if engine not in {"pandas", "polars"}:
        raise ValueError("engine should be one of pandas or polars.")
    base_engine = pd
    if engine == "polars":
        try:
            import polars as pl

            base_engine = pl
        except ImportError:
            import_message(
                submodule="polars",
                package="polars",
                conda_channel="conda-forge",
                pip_install=True,
            )

    if table is not None:
        check("table", table, [str, list, tuple])
        if isinstance(table, (list, tuple)):
            for num, entry in enumerate(table):
                check(f"entry{num} in the table argument", entry, [str])
    if isinstance(path, Workbook):
        ws = path
    else:
        ws = load_workbook(
            filename=path, read_only=False, keep_links=False, data_only=True
        )
    if ws.read_only:
        raise ValueError("xlsx_table does not work in read only mode.")

    def _create_dataframe_or_dictionary_from_table(
        table_name_and_worksheet: tuple,
    ):
        """
        Create DataFrame/dictionary if table exists in Workbook
        """
        dictionary = {}
        for table_name, worksheet in table_name_and_worksheet:
            contents = worksheet.tables[table_name]
            header_exist = contents.headerRowCount
            coordinates = contents.ref
            data = worksheet[coordinates]
            if header_exist:
                header, *data = data
                header = [cell.value for cell in header]
            else:
                header = [f"C{num}" for num in range(len(data[0]))]
            data = zip(*data)
            data = ([entry.value for entry in cell] for cell in data)
            data = dict(zip(header, data))
            dictionary[table_name] = base_engine.DataFrame(data)
        return dictionary

    worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]
    if not any(worksheets):
        raise ValueError("There are no tables in the Workbook.")
    table_is_a_string = False
    if table:
        if isinstance(table, str):
            table_is_a_string = True
            table = [table]
        table_names = (
            entry for worksheet in worksheets for entry in worksheet.tables
        )
        missing = set(table).difference(table_names)
        if missing:
            raise KeyError(f"Tables {*missing,} do not exist in the Workbook.")
        tables = [
            (entry, worksheet)
            for worksheet in worksheets
            for entry in worksheet.tables
            if entry in table
        ]
    else:
        tables = [
            (entry, worksheet)
            for worksheet in worksheets
            for entry in worksheet.tables
        ]
    data = _create_dataframe_or_dictionary_from_table(
        table_name_and_worksheet=tables
    )
    if table_is_a_string:
        return data[table[0]]
    return data