import pandas as pd

# Series

df = pd.DataFrame({
    "Date": [
        "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05",
        "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05",
    ],
    "Data": [5, 8, 6, 1, 50, 100, 60, 120],
})


def makechat(row):
    """Return a copy of *row* with an extra ``chat`` entry.

    Args:
        row: A row of the frame as a ``pd.Series``; it must have a ``Date``
            entry.

    Returns:
        A new Series holding every entry of *row* plus ``chat``, set to
        ``"<Date> is the date"``. The input row is left untouched, because
        mutating the object handed to ``apply`` is not supported by pandas.
    """
    out = row.copy()
    out["chat"] = f"{row.Date} is the date"
    return out


# NOTE: ``df.transform(makechat, axis=1)`` fails here with
# ``ValueError: Function did not transform``: ``transform`` requires the
# result to have exactly the same index as its input, and ``makechat`` adds a
# new ``chat`` label. ``apply`` has no such restriction, so it is the right
# tool for a row-wise function that adds a column.
df.apply(makechat, axis=1)
A Series can be considered as a type of dictionary, or as a generalised 1D NumPy array.
# A Series with an explicit, non-sequential integer index.
data = pd.Series([0.25, 0.5, 0.375, 1.0], index=[2, 5, 3, 7])
data[5]  # -> 0.5 (5 is looked up as an index label, not as a position)
`Series(data, index)`: `data` can be a scalar (which is then repeated once for every index label), a list or NumPy array (in which case the index defaults to a 0-based integer sequence), or a dictionary (in which case the index defaults to the dictionary keys).
# From a list: the index defaults to 0, 1, 2, ...
pd.Series([2, 4, 6])
# 0    2
# 1    4
# 2    6
# dtype: int64

# From a scalar: the value is repeated for every index label.
pd.Series(5, index=[100, 200, 300])
# 100    5
# 200    5
# 300    5
# dtype: int64

# From a dict: the keys become the index labels.
s = pd.Series({2: 'a', 1: 'b', 3: 'c'})
s[1]  # -> 'b'
In each case the index can be explicitly set to control the order, or the subset of keys used. E.g. here we don't use everything in the dictionary and we specify the order.
# Only the keys listed in ``index`` are kept, in the order given there.
pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[1, 2])
# 1    b
# 2    a
# dtype: object
DataFrames
Can be considered as either a specialization of a dictionary or a generalization of a NumPy array, or as a sequence of aligned Series. ‘aligned’ means sharing the same index.
# Two Series that share the same index (the state names) ...
population_dict = {
    'California': 39538223,
    'Texas': 29145505,
    'Florida': 21538187,
    'New York': 20201249,
    'Pennsylvania': 13002700,
}
population = pd.Series(population_dict)
population
# California      39538223
# Texas           29145505
# Florida         21538187
# New York        20201249
# Pennsylvania    13002700
# dtype: int64

area_dict = {
    'California': 423967,
    'Texas': 695662,
    'Florida': 170312,
    'New York': 141297,
    'Pennsylvania': 119280,
}
area = pd.Series(area_dict)
area
# California      423967
# Texas           695662
# Florida         170312
# New York        141297
# Pennsylvania    119280
# dtype: int64

# ... combined into a DataFrame: each Series becomes one column, and the rows
# are matched up by index label.
states = pd.DataFrame({'population': population, 'area': area})
states
#               population    area
# California      39538223  423967
# Texas           29145505  695662
# Florida         21538187  170312
# New York        20201249  141297
# Pennsylvania    13002700  119280
A DataFrame can be thought of as a generalisation of a 2D NumPy array, where both the rows and columns have a generalised index for accessing the data.
# Column labels of the frame.
states.columns
# Index(['population', 'area'], dtype='object')

# Row labels of the frame.
states.index
# Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

# Indexing a DataFrame with a column label returns that column as a Series.
states['area']
# California      423967
# Texas           695662
# Florida         170312
# New York        141297
# Pennsylvania    119280
# Name: area, dtype: int64
For a 2D NumPy array, `data[0]` will return the first row. For a DataFrame, `data['col0']` will return the first column.
# From a list of dicts: each dict becomes one row, and the dict keys become
# the column labels.
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)
#    a  b
# 0  0  0
# 1  1  2
# 2  2  4