import pandas as pd
Welcome: Comparing the code queries used in two popular data manipulation libraries: pandas in Python and dplyr in R.
Written on July 4, 2023
Load library
Loading pandas
Loading dplyr
library(dplyr)
Create data frame
data_sales = {
'group': ['A', 'B', 'C', 'A'],
'sales': [2000, 1500, 3000, 2500],
'gender': ['male', 'female',
'male', 'male'],
'age': [25, 30, 35, 32]
}
# Create the DataFrame
df = pd.DataFrame(data_sales)
df <- tibble(
group = c("A", "B", "C", "A"),
sales = c(2000, 1500, 3000, 2500),
gender = c("male", "female",
"male", "male"),
age = c(25, 30, 35, 32)
)
Select column(s)
df[["group", "gender"]]
df |>
select(group, gender)
Filter
df.query('sales >= 2500')
df |>
filter(sales >= 2500)
Rename column
df.rename(
columns={'group': 'working_class',
'sales': 'product_sales'})
df |>
rename(working_class = group,
product_sales = sales)
Create a variable
df['sales_times_2'] = df['sales'] * 2
df |>
mutate(sales_times_2 = sales * 2)
Modify a variable
df['sales'] = df['sales'] * 2
df |>
mutate(sales = sales * 2)
Summarization
(
df.groupby(['group'])
.agg({'sales': 'sum'})
.reset_index()
)
df |>
group_by(group) |>
summarise(sales = sum(sales))
Sort
df.sort_values('sales')
df |>
arrange(sales)
Sort (descending order)
df.sort_values('sales',
ascending=False)
df |>
arrange(desc(sales))