Pivot Data from Long to Wide Form

import janitor as jn
import pandas as pd

df = [
    {"name": "Alice", "variable": "wk1", "value": 5},
    {"name": "Alice", "variable": "wk2", "value": 9},
    {"name": "Alice", "variable": "wk3", "value": 20},
    {"name": "Alice", "variable": "wk4", "value": 22},
    {"name": "Bob", "variable": "wk1", "value": 7},
    {"name": "Bob", "variable": "wk2", "value": 11},
    {"name": "Bob", "variable": "wk3", "value": 17},
    {"name": "Bob", "variable": "wk4", "value": 33},
    {"name": "Carla", "variable": "wk1", "value": 6},
    {"name": "Carla", "variable": "wk2", "value": 13},
    {"name": "Carla", "variable": "wk3", "value": 39},
    {"name": "Carla", "variable": "wk4", "value": 40},
]


df = pd.DataFrame(df)

df

	name	variable	value
0	Alice	wk1	5
1	Alice	wk2	9
2	Alice	wk3	20
3	Alice	wk4	22
4	Bob	wk1	7
5	Bob	wk2	11
6	Bob	wk3	17
7	Bob	wk4	33
8	Carla	wk1	6
9	Carla	wk2	13
10	Carla	wk3	39
11	Carla	wk4	40

Reshaping to wide form:

df.pivot_wider(index="name", names_from="variable", values_from="value")

	name	wk1	wk2	wk3	wk4
0	Alice	5	9	20	22
1	Bob	7	11	17	33
2	Carla	6	13	39	40

Pivoting on multiple columns is possible :

df = [
    {"name": 1, "n": 10.0, "pct": 0.1},
    {"name": 2, "n": 20.0, "pct": 0.2},
    {"name": 3, "n": 30.0, "pct": 0.3},
]


df = pd.DataFrame(df)

df

	name	n	pct
0	1	10.0	0.1
1	2	20.0	0.2
2	3	30.0	0.3

(
    df.assign(num=0).pivot_wider(
        index="num", names_from="name", values_from=["n", "pct"], names_sep="_"
    )
)

	num	n_1	n_2	n_3	pct_1	pct_2	pct_3
0	0	10.0	20.0	30.0	0.1	0.2	0.3

You may choose not to flatten the columns, by setting flatten_levels to False:

df = [
    {"dep": 5.5, "step": 1, "a": 20, "b": 30},
    {"dep": 5.5, "step": 2, "a": 25, "b": 37},
    {"dep": 6.1, "step": 1, "a": 22, "b": 19},
    {"dep": 6.1, "step": 2, "a": 18, "b": 29},
]


df = pd.DataFrame(df)

df

	dep	step	a	b
0	5.5	1	20	30
1	5.5	2	25	37
2	6.1	1	22	19
3	6.1	2	18	29

df.pivot_wider(index="dep", names_from="step", flatten_levels=False)

	a		b
step	1	2	1	2
dep
5.5	20	25	30	37
6.1	22	18	19	29

The order of the levels can be changed with the levels_order parameter, which internally uses pandas' reorder_levels:

df.pivot_wider(
    index="dep", names_from="step", flatten_levels=False, levels_order=["step", None]
)

step	1	2	1	2
	a	a	b	b
dep
5.5	20	25	30	37
6.1	22	18	19	29

df.pivot_wider(
    index="dep",
    names_from="step",
    flatten_levels=True,
)

	dep	a_1	a_2	b_1	b_2
0	5.5	20	25	30	37
1	6.1	22	18	19	29

df.pivot_wider(
    index="dep", names_from="step", flatten_levels=True, levels_order=["step", None]
)

	dep	1_a	2_a	1_b	2_b
0	5.5	20	25	30	37
1	6.1	22	18	19	29

names_sep and names_glue come in handy in situations where names_from and/or values_from contain multiple variables; it is used primarily when the columns are flattened. The default value for names_sep is _:

# default value of names_sep is '_'
df.pivot_wider(index="dep", names_from="step")

	dep	a_1	a_2	b_1	b_2
0	5.5	20	25	30	37
1	6.1	22	18	19	29

df.pivot_wider(index="dep", names_from="step", names_sep="")

	dep	a1	a2	b1	b2
0	5.5	20	25	30	37
1	6.1	22	18	19	29

With names_glue you can glue the individual levels (if MultiIndex) into one (similar to names_sep), or you can modify the final columns, as long as it can be passed to pd.Index.map:

# replicate `names_sep`
df.pivot_wider(index="dep", names_from="step", names_sep=None, names_glue="_".join)

	dep	a_1	a_2	b_1	b_2
0	5.5	20	25	30	37
1	6.1	22	18	19	29

# going beyond names_sep
df.pivot_wider(
    index="dep",
    names_from="step",
    names_sep=None,
    names_glue=lambda col: f"{col[0]}_step{col[1]}",
)

	dep	a_step1	a_step2	b_step1	b_step2
0	5.5	20	25	30	37
1	6.1	22	18	19	29

There are scenarios where the column order of the final dataframe is important:

df = [
    {"Salesman": "Knut", "Height": 6, "product": "bat", "price": 5},
    {"Salesman": "Knut", "Height": 6, "product": "ball", "price": 1},
    {"Salesman": "Knut", "Height": 6, "product": "wand", "price": 3},
    {"Salesman": "Steve", "Height": 5, "product": "pen", "price": 2},
]

df = pd.DataFrame(df)

df

	Salesman	Height	product	price
0	Knut	6	bat	5
1	Knut	6	ball	1
2	Knut	6	wand	3
3	Steve	5	pen	2

idx = df.groupby(["Salesman", "Height"]).cumcount().add(1)

(df.assign(idx=idx).pivot_wider(index=["Salesman", "Height"], names_from="idx"))

	Salesman	Height	product_1	product_2	product_3	price_1	price_2	price_3
0	Knut	6	bat	ball	wand	5.0	1.0	3.0
1	Steve	5	pen	NaN	NaN	2.0	NaN	NaN

To get the columns in a form where product alternates with price, we can combine pivot_wider (or plain pd.pivot) with pd.sort_index and janitor.collapse_levels:

(
    df.assign(idx=idx)
    .pivot_wider(index=["Salesman", "Height"], names_from="idx", flatten_levels=False)
    .sort_index(level="idx", axis="columns", sort_remaining=False)
    .collapse_levels()
    .reset_index()
)

	Salesman	Height	product_1	price_1	product_2	price_2	product_3	price_3
0	Knut	6	bat	5.0	ball	1.0	wand	3.0
1	Steve	5	pen	2.0	NaN	NaN	NaN	NaN

df = pd.DataFrame(
    {
        "geoid": [1, 1, 13, 13],
        "name": ["Alabama", "Alabama", "Georgia", "Georgia"],
        "variable": [
            "pop_renter",
            "median_rent",
            "pop_renter",
            "median_rent",
        ],
        "estimate": [1434765, 747, 3592422, 927],
        "error": [16736, 3, 33385, 3],
    }
)

df

	geoid	name	variable	estimate	error
0	1	Alabama	pop_renter	1434765	16736
1	1	Alabama	median_rent	747	3
2	13	Georgia	pop_renter	3592422	33385
3	13	Georgia	median_rent	927	3

df.pivot_wider(
    index=["geoid", "name"],
    names_from="variable",
    values_from=["estimate", "error"],
    levels_order=["variable", None],
)

	geoid	name	median_rent_estimate	pop_renter_estimate	median_rent_error	pop_renter_error
0	1	Alabama	747	1434765	3	16736
1	13	Georgia	927	3592422	3	33385

For the reshaping above, we would like to maintain the order in variable, where pop_renter comes before median_rent; this can be achieved by converting the variable column to a categorical, before reshaping:

(
    df.encode_categorical(variable=(None, "appearance")).pivot_wider(
        index=["geoid", "name"],
        names_from="variable",
        values_from=["estimate", "error"],
        levels_order=["variable", None],
    )
)

	geoid	name	pop_renter_estimate	median_rent_estimate	pop_renter_error	median_rent_error
0	1	Alabama	1434765	747	16736	3
1	13	Georgia	3592422	927	33385	3