Series

import pandas as pd
# Example frame: the same four dates (newest first) appear twice, each
# occurrence paired with its own "Data" reading.
df = pd.DataFrame(
    dict(
        Date=["2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"] * 2,
        Data=[5, 8, 6, 1, 50, 100, 60, 120],
    )
)
def makechat(row):
    """Return a copy of ``row`` extended with a ``'chat'`` entry.

    The new entry holds the text ``"<Date> is the date"``, built from the
    row's ``Date`` value.

    Args:
        row: One row of the frame as a ``pandas.Series`` exposing a ``Date``
            entry (for example, what ``DataFrame.apply(..., axis=1)`` passes).

    Returns:
        A new Series containing every original entry plus ``'chat'``. The
        ``row`` argument itself is left unmodified.

    Raises:
        AttributeError: If ``row`` has no ``Date`` entry.
    """
    # Build the result on a copy: pandas does not support callbacks that
    # mutate the object they are handed by apply/transform.
    result = row.copy()
    result['chat'] = f"{row.Date} is the date"
    return result
# NOTE: transform() requires the result to keep the same index as its input
# (see the index check shown in the traceback below). makechat adds a new
# 'chat' label to every row, so this call raises
# "ValueError: Function did not transform".
# DataFrame.apply(makechat, axis=1) does not impose that restriction.
df.transform(makechat, axis=1)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 1
----> 1 df.transform(makechat, axis=1)

File ~/opt/miniconda3/envs/ThinkBayes2/lib/python3.11/site-packages/pandas/core/frame.py:10139, in DataFrame.transform(self, func, axis, *args, **kwargs)
  10136 from pandas.core.apply import frame_apply
  10138 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
> 10139 result = op.transform()
  10140 assert isinstance(result, DataFrame)
  10141 return result

File ~/opt/miniconda3/envs/ThinkBayes2/lib/python3.11/site-packages/pandas/core/apply.py:229, in Apply.transform(self)
    227 if obj._get_axis_number(axis) == 1:
    228     assert not is_series
--> 229     return obj.T.transform(func, 0, *args, **kwargs).T
    231 if is_list_like(func) and not is_dict_like(func):
    232     func = cast(list[AggFuncTypeBase], func)

File ~/opt/miniconda3/envs/ThinkBayes2/lib/python3.11/site-packages/pandas/core/frame.py:10139, in DataFrame.transform(self, func, axis, *args, **kwargs)
  10136 from pandas.core.apply import frame_apply
  10138 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
> 10139 result = op.transform()
  10140 assert isinstance(result, DataFrame)
  10141 return result

File ~/opt/miniconda3/envs/ThinkBayes2/lib/python3.11/site-packages/pandas/core/apply.py:267, in Apply.transform(self)
    260 # error: Argument 1 to "__get__" of "AxisProperty" has incompatible type
    261 # "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy,
    262 # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame,
    263 # Series]"
    264 if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
    265     obj.index  # type: ignore[arg-type]
    266 ):
--> 267     raise ValueError("Function did not transform")
    269 return result

ValueError: Function did not transform

A Series can be considered as a type of dictionary, or as a generalised 1D NumPy array.

data = pd.Series([0.25, 0.5, 0.375, 1.0], index = [2,5,3,7])
data[5]
0.5

Series(data, index): data can be a scalar (which is then repeated once for each entry in the index), a list or NumPy array (in which case the index defaults to a 0-based integer sequence), or a dictionary (in which case the index defaults to the dictionary keys).

pd.Series([2,4,6])
0    2
1    4
2    6
dtype: int64
pd.Series(5, index=[100, 200, 300])
100    5
200    5
300    5
dtype: int64
s = pd.Series({2:'a', 1:'b', 3:'c'})
s[1]
'b'

In each case the index can be explicitly set to control the order, or the subset of keys used. For example, here we don’t use everything in the dictionary, and we specify the order.

pd.Series({2:'a', 1:'b', 3:'c'}, index=[1,2])
1    b
2    a
dtype: object

DataFrames

A DataFrame can be considered as a specialization of a dictionary, as a generalization of a NumPy array, or as a sequence of aligned Series. ‘Aligned’ means sharing the same index.

population_dict = {'California': 39538223, 'Texas': 29145505,
                            'Florida': 21538187, 'New York': 20201249,
                            'Pennsylvania': 13002700}
population = pd.Series(population_dict)
population
California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64
area_dict = {'California': 423967, 'Texas': 695662, 'Florida': 170312,
                      'New York': 141297, 'Pennsylvania': 119280}
area = pd.Series(area_dict)
area
California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
dtype: int64
states = pd.DataFrame({'population': population, 'area': area})
states
              population    area
California      39538223  423967
Texas           29145505  695662
Florida         21538187  170312
New York        20201249  141297
Pennsylvania    13002700  119280

A DataFrame can be thought of as a generalisation of a 2D NumPy array, where both the rows and columns have a generalised index for accessing the data.

states.columns
Index(['population', 'area'], dtype='object')
states.index
Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')
states['area']
California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

For a 2D NumPy array, data[0] will return the first row. For a DataFrame, data['col0'] will return the first column.

# from a list of dicts
data = [{'a': i, 'b': 2*i} for i in range(3)]
pd.DataFrame(data)
   a  b
0  0  0
1  1  2
2  2  4