Skip to content

Binder

Pivot Data from Long to Wide Form

import janitor as jn
import pandas as pd
df = [
    {"name": "Alice", "variable": "wk1", "value": 5},
    {"name": "Alice", "variable": "wk2", "value": 9},
    {"name": "Alice", "variable": "wk3", "value": 20},
    {"name": "Alice", "variable": "wk4", "value": 22},
    {"name": "Bob", "variable": "wk1", "value": 7},
    {"name": "Bob", "variable": "wk2", "value": 11},
    {"name": "Bob", "variable": "wk3", "value": 17},
    {"name": "Bob", "variable": "wk4", "value": 33},
    {"name": "Carla", "variable": "wk1", "value": 6},
    {"name": "Carla", "variable": "wk2", "value": 13},
    {"name": "Carla", "variable": "wk3", "value": 39},
    {"name": "Carla", "variable": "wk4", "value": 40},
]


df = pd.DataFrame(df)

df
name variable value
0 Alice wk1 5
1 Alice wk2 9
2 Alice wk3 20
3 Alice wk4 22
4 Bob wk1 7
5 Bob wk2 11
6 Bob wk3 17
7 Bob wk4 33
8 Carla wk1 6
9 Carla wk2 13
10 Carla wk3 39
11 Carla wk4 40

Reshaping to wide form:

df.pivot_wider(index="name", names_from="variable", values_from="value")
name wk1 wk2 wk3 wk4
0 Alice 5 9 20 22
1 Bob 7 11 17 33
2 Carla 6 13 39 40

Pivoting on multiple columns is possible :

df = [
    {"name": 1, "n": 10.0, "pct": 0.1},
    {"name": 2, "n": 20.0, "pct": 0.2},
    {"name": 3, "n": 30.0, "pct": 0.3},
]


df = pd.DataFrame(df)

df
name n pct
0 1 10.0 0.1
1 2 20.0 0.2
2 3 30.0 0.3
(
    df.assign(num=0).pivot_wider(
        index="num", names_from="name", values_from=["n", "pct"], names_sep="_"
    )
)
num n_1 n_2 n_3 pct_1 pct_2 pct_3
0 0 10.0 20.0 30.0 0.1 0.2 0.3

You may choose not to flatten the columns, by setting flatten_levels to False:

df = [
    {"dep": 5.5, "step": 1, "a": 20, "b": 30},
    {"dep": 5.5, "step": 2, "a": 25, "b": 37},
    {"dep": 6.1, "step": 1, "a": 22, "b": 19},
    {"dep": 6.1, "step": 2, "a": 18, "b": 29},
]


df = pd.DataFrame(df)

df
dep step a b
0 5.5 1 20 30
1 5.5 2 25 37
2 6.1 1 22 19
3 6.1 2 18 29
df.pivot_wider(index="dep", names_from="step", flatten_levels=False)
a b
step 1 2 1 2
dep
5.5 20 25 30 37
6.1 22 18 19 29

The order of the levels can be changed with the levels_order parameter, which internally uses pandas' reorder_levels:

df.pivot_wider(
    index="dep", names_from="step", flatten_levels=False, levels_order=["step", None]
)
step 1 2 1 2
a a b b
dep
5.5 20 25 30 37
6.1 22 18 19 29
df.pivot_wider(
    index="dep",
    names_from="step",
    flatten_levels=True,
)
dep a_1 a_2 b_1 b_2
0 5.5 20 25 30 37
1 6.1 22 18 19 29
df.pivot_wider(
    index="dep", names_from="step", flatten_levels=True, levels_order=["step", None]
)
dep 1_a 2_a 1_b 2_b
0 5.5 20 25 30 37
1 6.1 22 18 19 29

names_sep and names_glue come in handy in situations where names_from and/or values_from contain multiple variables; it is used primarily when the columns are flattened. The default value for names_sep is _:

# default value of names_sep is '_'
df.pivot_wider(index="dep", names_from="step")
dep a_1 a_2 b_1 b_2
0 5.5 20 25 30 37
1 6.1 22 18 19 29
df.pivot_wider(index="dep", names_from="step", names_sep="")
dep a1 a2 b1 b2
0 5.5 20 25 30 37
1 6.1 22 18 19 29

With names_glue you can glue the individual levels (if MultiIndex) into one (similar to names_sep), or you can modify the final columns, as long as it can be passed to pd.Index.map:

# replicate `names_sep`
df.pivot_wider(index="dep", names_from="step", names_sep=None, names_glue="_".join)
dep a_1 a_2 b_1 b_2
0 5.5 20 25 30 37
1 6.1 22 18 19 29
# going beyond names_sep
df.pivot_wider(
    index="dep",
    names_from="step",
    names_sep=None,
    names_glue=lambda col: f"{col[0]}_step{col[1]}",
)
dep a_step1 a_step2 b_step1 b_step2
0 5.5 20 25 30 37
1 6.1 22 18 19 29

There are scenarios where the column order of the final dataframe is important:

df = [
    {"Salesman": "Knut", "Height": 6, "product": "bat", "price": 5},
    {"Salesman": "Knut", "Height": 6, "product": "ball", "price": 1},
    {"Salesman": "Knut", "Height": 6, "product": "wand", "price": 3},
    {"Salesman": "Steve", "Height": 5, "product": "pen", "price": 2},
]

df = pd.DataFrame(df)

df
Salesman Height product price
0 Knut 6 bat 5
1 Knut 6 ball 1
2 Knut 6 wand 3
3 Steve 5 pen 2
idx = df.groupby(["Salesman", "Height"]).cumcount().add(1)

(df.assign(idx=idx).pivot_wider(index=["Salesman", "Height"], names_from="idx"))
Salesman Height product_1 product_2 product_3 price_1 price_2 price_3
0 Knut 6 bat ball wand 5.0 1.0 3.0
1 Steve 5 pen NaN NaN 2.0 NaN NaN

To get the columns in a form where product alternates with price, we can combine pivot_wider (or plain pd.pivot) with pd.sort_index and janitor.collapse_levels:

(
    df.assign(idx=idx)
    .pivot_wider(index=["Salesman", "Height"], names_from="idx", flatten_levels=False)
    .sort_index(level="idx", axis="columns", sort_remaining=False)
    .collapse_levels()
    .reset_index()
)
Salesman Height product_1 price_1 product_2 price_2 product_3 price_3
0 Knut 6 bat 5.0 ball 1.0 wand 3.0
1 Steve 5 pen 2.0 NaN NaN NaN NaN
df = pd.DataFrame(
    {
        "geoid": [1, 1, 13, 13],
        "name": ["Alabama", "Alabama", "Georgia", "Georgia"],
        "variable": [
            "pop_renter",
            "median_rent",
            "pop_renter",
            "median_rent",
        ],
        "estimate": [1434765, 747, 3592422, 927],
        "error": [16736, 3, 33385, 3],
    }
)

df
geoid name variable estimate error
0 1 Alabama pop_renter 1434765 16736
1 1 Alabama median_rent 747 3
2 13 Georgia pop_renter 3592422 33385
3 13 Georgia median_rent 927 3
df.pivot_wider(
    index=["geoid", "name"],
    names_from="variable",
    values_from=["estimate", "error"],
    levels_order=["variable", None],
)
geoid name median_rent_estimate pop_renter_estimate median_rent_error pop_renter_error
0 1 Alabama 747 1434765 3 16736
1 13 Georgia 927 3592422 3 33385

For the reshaping above, we would like to maintain the order in variable, where pop_renter comes before median_rent; this can be achieved by converting the variable column to a categorical, before reshaping:

(
    df.encode_categorical(variable=(None, "appearance")).pivot_wider(
        index=["geoid", "name"],
        names_from="variable",
        values_from=["estimate", "error"],
        levels_order=["variable", None],
    )
)
geoid name pop_renter_estimate median_rent_estimate pop_renter_error median_rent_error
0 1 Alabama 1434765 747 16736 3
1 13 Georgia 3592422 927 33385 3