In [1]:
!pip install psycopg2-binary
import psycopg2
from google.colab import userdata
password_post = userdata.get('postpass')
conn = psycopg2.connect(
host="8.tcp.ngrok.io",
port=16178,
database="test_db",
user="postgres",
password=password_post
)

cursor = conn.cursor()

cursor.execute("SELECT version();")

print(cursor.fetchone())
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (4.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.2/4.2 MB 27.2 MB/s eta 0:00:00
Installing collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.11
('PostgreSQL 18.1 on x86_64-windows, compiled by msvc-19.44.35221, 64-bit',)
In [ ]:
conn.rollback()

cursor.execute("""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'e-commerce-project';
""")

print(cursor.fetchall())
[('olist_products_dataset',), ('olist_sellers_dataset',), ('product_category_name_translation',), ('olist_order_reviews_dataset',), ('olist_geolocation_dataset',), ('olist_order_items_dataset',), ('olist_order_payments_dataset',), ('olist_customers_dataset',), ('olist_orders_dataset',)]
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install plotly

import plotly.express as px
Requirement already satisfied: plotly in /usr/local/lib/python3.12/dist-packages (5.24.1)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.12/dist-packages (from plotly) (9.1.4)
Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from plotly) (26.0)

What are the top-selling product categories?¶

In [ ]:
query_1 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""
df_items = pd.read_sql(query_1, conn)
df_items
/tmp/ipykernel_2257/425679890.py:6: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 09:45:35+00:00 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 11:05:13+00:00 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 14:48:30+00:00 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 10:10:18+00:00 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 13:57:51+00:00 199.90 18.14
In [ ]:
query_2 = """
SELECT *
FROM "e-commerce-project".olist_products_dataset
LIMIT 5
"""
df_products = pd.read_sql(query_2, conn)
df_products
/tmp/ipykernel_2257/4179090135.py:6: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_id product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
0 1e9e8ef04dbcff4541ed26657ea517e5 perfumaria 40 287 1 225 16 10 14
1 3aa071139cb16b67ca9e5dea641aaa2f artes 44 276 1 1000 30 18 20
2 96bd76ec8810374ed1b65e291975717f esporte_lazer 46 250 1 154 18 9 15
3 cef67bcfe19066a932b7673e239eb23d bebes 27 261 1 371 26 4 26
4 9dc1a7de274444849c219cff195d0b71 utilidades_domesticas 37 402 4 625 20 17 13
In [ ]:
query_3 = """
SELECT *
FROM "e-commerce-project".product_category_name_translation
LIMIT 5
"""
df_trans = pd.read_sql(query_3, conn)
df_trans
/tmp/ipykernel_2257/2897695490.py:6: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_category_name product_category_name_english
0 beleza_saude health_beauty
1 informatica_acessorios computers_accessories
2 automotivo auto
3 cama_mesa_banho bed_bath_table
4 moveis_decoracao furniture_decor
In [ ]:
conn.rollback()
query_4 = """
SELECT t.product_category_name_english, COUNT(order_id) AS total_orders
FROM "e-commerce-project".olist_order_items_dataset o
JOIN "e-commerce-project".olist_products_dataset p
ON o.product_id = p.product_id
JOIN "e-commerce-project".product_category_name_translation t
ON p.product_category_name = t.product_category_name
GROUP BY t.product_category_name_english
ORDER BY total_orders DESC
LIMIT 10
"""
df_q1 = pd.read_sql(query_4, conn)
df_q1
/tmp/ipykernel_2257/297876490.py:13: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_category_name_english total_orders
0 bed_bath_table 11115
1 health_beauty 9670
2 sports_leisure 8641
3 furniture_decor 8334
4 computers_accessories 7827
5 housewares 6964
6 watches_gifts 5991
7 telephony 4545
8 garden_tools 4347
9 auto 4235
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="total_orders", y="product_category_name_english", data=df_q1, palette="viridis")
plt.title("Top 10 Product Categories")
plt.xlabel("Total Orders")
plt.ylabel("Product category")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/2842852274.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image
In [ ]:
conn.rollback()
query_5 = """
SELECT t.product_category_name_english, SUM(o.price) AS revenue
FROM "e-commerce-project".olist_order_items_dataset AS o
INNER JOIN "e-commerce-project".olist_products_dataset AS p
ON o.product_id = p.product_id
INNER JOIN "e-commerce-project".product_category_name_translation AS t
ON p.product_category_name = t.product_category_name
GROUP BY t.product_category_name_english
ORDER BY revenue DESC
LIMIT 10
"""
df_q2 = pd.read_sql(query_5, conn)
df_q2
/tmp/ipykernel_2257/187717851.py:13: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_category_name_english revenue
0 health_beauty 1258681.34
1 watches_gifts 1205005.68
2 bed_bath_table 1036988.68
3 sports_leisure 988048.97
4 computers_accessories 911954.32
5 furniture_decor 729762.49
6 cool_stuff 635290.85
7 housewares 632248.66
8 auto 592720.11
9 garden_tools 485256.46
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="revenue", y="product_category_name_english", data=df_q2, palette="magma")
plt.title("Total revenue per category")
plt.xlabel("Revenue")
plt.ylabel("Product Category")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/2624280649.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image
In [ ]:
conn.rollback()
query_6 = """
SELECT t.product_category_name_english, SUM(o.price + o.freight_value) AS revenue
FROM "e-commerce-project".olist_order_items_dataset AS o
INNER JOIN "e-commerce-project".olist_products_dataset AS p
ON o.product_id = p.product_id
INNER JOIN "e-commerce-project".product_category_name_translation AS t
ON p.product_category_name = t.product_category_name
GROUP BY t.product_category_name_english
ORDER BY revenue DESC
LIMIT 10
"""
df_q3 = pd.read_sql(query_6, conn)
df_q3
/tmp/ipykernel_2257/1506463664.py:13: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_category_name_english revenue
0 health_beauty 1441248.07
1 watches_gifts 1305541.61
2 bed_bath_table 1241681.72
3 sports_leisure 1156656.48
4 computers_accessories 1059272.40
5 furniture_decor 902511.79
6 housewares 778397.77
7 cool_stuff 719329.95
8 auto 685384.32
9 garden_tools 584219.21
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="revenue", y="product_category_name_english", data=df_q3, palette="magma")
plt.title("Total revenue per category")
plt.xlabel("Revenue")
plt.ylabel("Product Category")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/3599276967.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image

Revenue Leadership: Health & Beauty

  • Health & Beauty stands as the most profitable category with a total revenue of $1,258,681.34
  • This suggests a high consumer demand coupled with a strong price-per-unit ratio, making it the "star" category of the ecosystem.

High-Ticket Strategy: Watches & Gifts

  • A major insight is found in Watches & Gifts. While it ranks lower in total order volume (7th place), it ranks 2nd in total revenue.
  • This category relies on high-ticket items. It requires fewer sales to generate massive revenue, indicating a premium market segment.

Volume vs. Value: Bed, Bath & Table

  • Bed, Bath & Table is the absolute leader in quantity of orders (over 11,000). However, it drops to 3rd place in revenue.
  • This indicates that shipping costs represent a significant portion of the total transaction value for these categories, likely due to their weight or size.

Logistic Impact

  • By comparing we notice that categories like Housewares and Garden Tools lose ground when freight is removed.
  • This indicates that shipping costs represent a significant portion of the total transaction value for these categories, likely due to their weight or size.

Business Recommendations¶

Marketing Focus: Allocate more budget to Health & Beauty and Watches & Gifts to maximize Return on Ad Spend (ROAS).

Retention: Since Bed, Bath & Table has the most orders, use it as a "hook" for loyalty programs or cross-selling other products.

Logistics: Review shipping contracts for Housewares to reduce the "Freight Gap" and make those products more competitive.


Which states generate the highest revenue?¶

In [ ]:
conn.rollback()
query_7 = """
SELECT *
FROM "e-commerce-project".olist_orders_dataset
LIMIT 5
"""
df_q4 = pd.read_sql(query_7, conn)
df_q4
/tmp/ipykernel_2257/31892081.py:7: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id customer_id order_status order_purchase_timestamp order_approved_atorder_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26
In [ ]:
conn.rollback()
query_8 = """
SELECT *
FROM "e-commerce-project".olist_customers_dataset
LIMIT 5
"""

df_q5 = pd.read_sql(query_8, conn)
df_q5
/tmp/ipykernel_2257/3662583808.py:8: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
customer_id customer_unique_id customer_city customer_state customer_zip_code_prefix
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 franca SP 14409
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 sao bernardo do campo SP 09790
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e sao paulo SP 01151
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c mogi das cruzes SP 08775
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 campinas SP 13056
In [ ]:
conn.rollback()
query_9 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""
df_q6 = pd.read_sql(query_9, conn)
df_q6
/tmp/ipykernel_2257/2514142902.py:7: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 09:45:35+00:00 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 11:05:13+00:00 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 14:48:30+00:00 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 10:10:18+00:00 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 13:57:51+00:00 199.90 18.14
In [ ]:
conn.rollback()
query_10 = """
SELECT c.customer_state AS state, COUNT(DISTINCT o.order_id) AS total_orders, SUM(oi.price) AS revenue
FROM "e-commerce-project".olist_customers_dataset AS c
JOIN "e-commerce-project".olist_orders_dataset AS o
  ON c.customer_id = o.customer_id
JOIN "e-commerce-project".olist_order_items_dataset AS oi
  ON oi.order_id = o.order_id
GROUP BY customer_state
ORDER BY revenue DESC
"""
df_q7 = pd.read_sql(query_10, conn)
df_q7
/tmp/ipykernel_2257/363883029.py:12: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
state total_orders revenue
0 SP 41375 5202955.05
1 RJ 12762 1824092.67
2 MG 11544 1585308.03
3 RS 5432 750304.02
4 PR 4998 683083.76
5 SC 3612 520553.34
6 BA 3358 511349.99
7 DF 2125 302603.94
8 GO 2007 294591.95
9 ES 2025 275037.31
10 PE 1648 262788.03
11 CE 1327 227254.71
12 PA 970 178947.81
13 MT 903 156453.53
14 MA 740 119648.22
15 MS 709 116812.64
16 PB 532 115268.08
17 PI 493 86914.08
18 RN 482 83034.98
19 AL 411 80314.81
20 SE 345 58920.85
21 TO 279 49621.74
22 RO 247 46140.64
23 AM 147 22356.84
24 AC 81 15982.95
25 AP 68 13474.30
26 RR 46 7829.43
In [ ]:
import json
import requests

url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson"
geojson = requests.get(url).json()

fig = px.choropleth(
df_q7,
geojson=geojson,
locations="state",
featureidkey="properties.sigla", # matches SP, RJ, MG etc
color="revenue",
hover_name="state",
hover_data=["total_orders", "revenue"],
title="Revenue by Brazilian State",
color_continuous_scale="Blues"
)

fig.update_geos(
fitbounds="locations",
visible=False
)

fig.show()

The Absolute Dominance of São Paulo

  • São Paulo is not just the leader; it is the economic engine of the platform. With $5,202,955 in revenue, it generates nearly 3 times more than the second-place state (Rio de Janeiro).
  • This dominance is likely due to better logistics, more concentrated seller hubs, and a higher density of e-commerce users.

Southeast Hub Centralization

  • The Top 3 states (SP, RJ, MG) all belong to the Southeast Region.
  • These three states combined represent the vast majority of the company's total cash flow. Business efforts are highly centralized in the industrial and financial heart of Brazil.

The Logistics Gap (North vs. South)

  • States like RR (Roraima) and AP (Amapá) show the lowest activity, with fewer than 100 orders each
  • The sharp decline in revenue as we move away from the South/Southeast suggests that high shipping costs (freight) and long delivery times are significant barriers for customers in remote regions.

Business Recommendations¶

Fulfillment Centers: Consider placing more "Fulfillment by Olist" centers in SP and MG to maintain dominance and reduce delivery times even further.

Subsidized Shipping: To grow the market in the North/Northeast, a strategy involving subsidized freight or regional distribution centers might be necessary to convert potential customers who are currently deterred by shipping prices.

Targeted Marketing: Focus high-conversion ad campaigns in RJ and MG, as they show high volume but still have significant room to grow compared to SP.

Who are the top-performing sellers?¶

In [ ]:
conn.rollback()

query_11 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 10
"""

df_q8 = pd.read_sql(query_11, conn)
df_q8
/tmp/ipykernel_2257/839127881.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 09:45:35+00:00 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 11:05:13+00:00 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 14:48:30+00:00 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 10:10:18+00:00 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 13:57:51+00:00 199.90 18.14
5 00048cc3ae777c65dbb7d2a0634bc1ea 1 ef92defde845ab8450f9d70c526ef70f 6426d21aca402a131fc0a5d0960a3c90 03:55:27+00:00 21.90 12.69
6 00054e8431b9d7675808bcb819fb4a32 1 8d4f2bb7e93e6710a28f34fa83ee7d28 7040e82f899a04d1b434b795a43b4617 12:10:31+00:00 19.90 11.85
7 000576fe39319847cbb9d288c5617fa6 1 557d850972a7d6f792fd18ae1400d9b6 5996cddab893a4652a15592fb58ab8db 12:30:45+00:00 810.00 70.75
8 0005a1a1728c9d785b8e2b08b904576c 1 310ae3c140ff94b03219ad0adc3c778f a416b6a846a11724393025641d4edd5e 18:31:29+00:00 145.95 11.65
9 0005f50442cb953dcd1d21e1fb923495 1 4535b0e1091c278dfd193e5a1d63b39f ba143b05f0110f0dc71ad71b4466ce92 14:10:56+00:00 53.99 11.40
In [ ]:
conn.rollback()

query_12 = """
SELECT *
FROM "e-commerce-project".olist_sellers_dataset
LIMIT 10
"""

df_q9 = pd.read_sql(query_12, conn)
df_q9
/tmp/ipykernel_2257/4099490292.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
seller_id seller_zip_code_prefix seller_city seller_state
0 3442f8959a84dea7ee197c632cb2df15 13023 campinas SP
1 d1b65fc7debc3361ea86b5f14c68d2e2 13844 mogi guacu SP
2 ce3ad9de960102d0677a81f5d0bb7b2d 20031 rio de janeiro RJ
3 c0f3eea2e14555b6faeea3dd58c1b1c3 4195 sao paulo SP
4 51a04a8a6bdcb23deccc82b0b80742cf 12914 braganca paulista SP
5 c240c4061717ac1806ae6ee72be3533b 20920 rio de janeiro RJ
6 e49c26c3edfa46d227d5121a6b6e4d37 55325 brejao PE
7 1b938a7ec6ac5061a66a3766e0e75f90 16304 penapolis SP
8 768a86e36ad6aae3d03ee3c6433d61df 1529 sao paulo SP
9 ccc4bbb5f32a6ab2b7066a4130f114e3 80310 curitiba PR
In [ ]:
conn.rollback()

query_13 = """
SELECT DISTINCT s.seller_id AS seller, sum(oi.price) AS revenue
FROM "e-commerce-project".olist_order_items_dataset AS oi
JOIN "e-commerce-project".olist_sellers_dataset AS S
  ON oi.seller_id = s.seller_id
GROUP BY seller
ORDER BY revenue DESC
LIMIT 10
"""

df_q10 = pd.read_sql(query_13, conn)
df_q10
/tmp/ipykernel_2257/1905358826.py:13: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
seller revenue
0 4869f7a5dfa277a7dca6462dcf3b52b2 229472.63
1 53243585a1d6dc2643021fd1853d8905 222776.05
2 4a3ca9315b744ce9f8e9374361493884 200472.92
3 fa1c13f2614d7b5c4749cbc52fecda94 194042.03
4 7c67e1448b00f6e969d365cea6b010ab 187923.89
5 7e93a43ef30c4f03f38b393420bc753a 176431.87
6 da8622b14eb17ae2831f4ac5b9dab84a 160236.57
7 7a67c85e85bb2ce8582c35f2203ad736 141745.53
8 1025f0e2d44d7041d6cf58b6550e0bfa 138968.55
9 955fee9216a65b617aa5c0531780ce60 135171.70
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="revenue", y="seller", data=df_q10, palette="viridis")
plt.title("Top 10 Sellers by Revenue")
plt.xlabel("Total Revenue")
plt.ylabel("Seller ID")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/1324732387.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image
In [ ]:
conn.rollback()

query_14 = """
SELECT DISTINCT s.seller_id AS seller, COUNT(oi.order_id) AS total_orders
FROM "e-commerce-project".olist_order_items_dataset AS oi
JOIN "e-commerce-project".olist_sellers_dataset AS S
  ON oi.seller_id = s.seller_id
GROUP BY seller
ORDER BY total_orders DESC
LIMIT 10
"""

df_q11 = pd.read_sql(query_14, conn)
df_q11
/tmp/ipykernel_2257/2051337999.py:13: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
seller total_orders
0 6560211a19b47992c3666cc44a7e94c0 2033
1 4a3ca9315b744ce9f8e9374361493884 1987
2 1f50f920176fa81dab994f9023523100 1931
3 cc419e0650a3c5ba77189a1882b7556a 1775
4 da8622b14eb17ae2831f4ac5b9dab84a 1551
5 955fee9216a65b617aa5c0531780ce60 1499
6 1025f0e2d44d7041d6cf58b6550e0bfa 1428
7 7c67e1448b00f6e969d365cea6b010ab 1364
8 ea8482cd71df3c1969d7b9473ff13abc 1203
9 7a67c85e85bb2ce8582c35f2203ad736 1171
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="total_orders", y="seller", data=df_q11, palette="magma")
plt.title("Top 10 Sellers by Total Orders")
plt.xlabel("Total Orders")
plt.ylabel("Seller ID")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/2652762271.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image

The "Whale" Sellers (High Revenue, High Volume)

  • Seller 4a3ca9315b744ce9f8e9374361493884 is a top performer in both metrics: #2 in total orders (1,987) and #3 in revenue ($194,004).
  • This seller is a cornerstone of the platform, maintaining high sales velocity while selling items with a healthy profit margin.

The "High-Frequency" Specialist

  • Seller 6560211a19b47992c3666cc44a7e94c0 leads the platform in total orders with 2,033 sales. However, they do not appear as the #1 leader in revenue.
  • This seller likely focuses on low-cost, high-turnover items (e.g., small accessories or household essentials). They drive massive traffic and logistics activity but with lower individual transaction values.

The "Premium" Specialist

  • Seller 4869f7a5dfa277a7dca6462dcf3b52b2 is the #1 Revenue Leader ($229,472) but does not have the highest number of orders.
  • This seller likely deals in high-ticket items (electronics, luxury watches, or furniture). They generate the most cash flow for the platform with fewer individual shipments.

Business Recommendations¶

Incentivize the "Whale" Sellers: Ensure sellers like 4a3ca9315b744ce9f8e9374361493884 have dedicated support, as they are the most valuable to the platform's stability.

Efficiency for High Volume: For sellers like 6560211a19b47992c3666cc44a7e94c0, the focus should be on logistics optimization. Since they ship thousands of items, even a small reduction in shipping cost or packaging time significantly increases their margin.

Risk Assessment: The marketplace is relatively healthy because the top revenue and top volume are spread across different sellers. There isn't a single seller that "owns" the entire platform, which reduces dependency risk.

What payment methods are most popular?¶

In [ ]:
conn.rollback()

query_15 = """
SELECT *
FROM "e-commerce-project".olist_order_payments_dataset
LIMIT 5
"""

df_q12 = pd.read_sql(query_15, conn)
df_q12
/tmp/ipykernel_2257/2582401646.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id payment_sequential payment_type payment_installments payment_value
0 b81ef226f3fe1789b1e8b2acac839d17 1 credit_card 8 99.33
1 a9810da82917af2d9aefd1278f1dcfa0 1 credit_card 1 24.39
2 25e8ea4e93396b6fa0d3dd708e76c1bd 1 credit_card 1 65.71
3 ba78997921bbcdc1373bb41e913ab953 1 credit_card 8 107.78
4 42fdf880ba16b47b59251dd489d4441a 1 credit_card 2 128.45
In [ ]:
conn.rollback()

query_16 = """
SELECT payment_type, COUNT(order_id) AS count, SUM(payment_value) AS revenue
FROM "e-commerce-project".olist_order_payments_dataset
GROUP BY payment_type
ORDER BY count DESC
LIMIT 4
"""

df_q13 = pd.read_sql(query_16, conn)
df_q13
/tmp/ipykernel_2257/787904435.py:11: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
payment_type count revenue
0 credit_card 76795 12542084.19
1 boleto 19784 2869361.27
2 voucher 5775 379436.87
3 debit_card 1529 217989.79
In [ ]:
plt.pie(df_q13["count"], labels=df_q13["payment_type"], autopct='%1.1f%%')
plt.show()
No description has been provided for this image
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="revenue", y="payment_type", data=df_q13, palette="magma")
plt.title("Revenue By Payment Type")
plt.xlabel("Revenue")
plt.ylabel("Payment Type")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/1527151366.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image

Credit Card Dominance

  • Credit cards are the undisputed primary payment method, accounting for the vast majority of transactions (76,795 orders) and revenue ($12,542,084.19).
  • Since credit cards allow for installments (as seen in your df_q11 with up to 8 installments), they are likely the main driver for high-ticket purchases. The platform’s revenue is highly dependent on credit card processing.

The "Boleto" Factor

  • The Boleto (Brazilian bank slip) is the second most popular method with 19,784 orders.
  • Boleto is a cash-based payment. This indicates a significant portion of the customer base either prefers not to use credit or does not have access to a credit limit, making this method essential for financial inclusion in the Brazilian market.

Vouchers and Retention

  • There were 5,775 transactions using vouchers
  • Vouchers are often linked to refunds, loyalty rewards, or promotional discounts. Their presence suggests an active customer retention or compensation system within the Olist ecosystem.

Debit Card Underutilization

  • Debit cards represent the smallest share (1,529 orders)
  • In Brazilian e-commerce, debit cards historically had lower adoption due to security friction and the lack of installment options compared to credit cards.

Business Recommendations¶

Installment Promotions: Since Credit Card is the leader, offering "interest-free" installments for specific high-value categories (like Watches or Electronics) could further boost the average ticket.

Boleto Incentives: Consider offering a small discount (e.g., 5%) for Boleto or Debit Card payments. This reduces the transaction fees paid to credit card processors and increases immediate cash flow.

Voucher Strategy: Analyze the source of the vouchers. If they are mostly from "Returns," it might indicate a quality issue in certain categories; if they are "Promotional," they are successfully driving repeat purchases.

How long does delivery usually take?¶

In [ ]:
conn.rollback()

query_17 = """
SELECT *
FROM "e-commerce-project".olist_orders_dataset
LIMIT 5
"""

df_q14 = pd.read_sql(query_17, conn)
df_q14
/tmp/ipykernel_2257/3532966613.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id customer_id order_status order_purchase_timestamp order_approved_atorder_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26
In [ ]:
conn.rollback()

query_18 = """
SELECT order_id, order_delivered_customer_date, order_purchase_timestamp, order_delivered_customer_date - order_purchase_timestamp AS delivery_time
FROM "e-commerce-project".olist_orders_dataset
WHERE order_delivered_customer_date IS NOT NULL
ORDER BY delivery_time DESC
"""

df_q15 = pd.read_sql(query_18, conn)
df_q15
/tmp/ipykernel_2257/2167054448.py:10: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id order_delivered_customer_date order_purchase_timestamp delivery_time
0 ca07593549f1816d26a572e06dc1eab6 2017-09-19 14:36:39 2017-02-21 23:31:27 209 days 15:05:12
1 1b3190b2dfa9d789e1f14c05b647a14a 2018-09-19 23:24:07 2018-02-23 14:57:35 208 days 08:26:32
2 440d0d17af552815d15a9e41abe49359 2017-09-19 15:12:50 2017-03-07 23:59:51 195 days 15:12:59
3 2fb597c2f772eca01b1f5c561bf6cc7b 2017-09-19 14:33:17 2017-03-08 18:09:02 194 days 20:24:15
4 285ab9426d6982034523a855f55a885e 2017-09-19 14:00:04 2017-03-08 22:47:40 194 days 15:12:24
... ... ... ... ...
96471 bb5a519e352b45b714192a02ffe25681 2017-06-01 08:34:36 2017-05-31 11:11:55 0 days 21:22:41
96472 8339b608be0d84fca9d8da68b58332c3 2018-06-27 17:31:53 2018-06-26 20:48:33 0 days 20:43:20
96473 f3c6775ba3d2d9fe2826f93b71f12008 2017-07-05 08:09:26 2017-07-04 11:37:47 0 days 20:31:39
96474 434cecee7d1a65fc65358a632b6f725f 2017-05-30 08:06:56 2017-05-29 13:21:46 0 days 18:45:10
96475 1d893dd7ca5f77ebf5f59f0d2017eee0 2017-06-19 21:07:52 2017-06-19 08:19:45 0 days 12:48:07

96476 rows × 4 columns

In [ ]:
df_q15["delivery_time"] = df_q15["delivery_time"].dt.days
In [ ]:
df_q15.head()
Out[ ]:
order_id order_delivered_customer_date order_purchase_timestamp delivery_time
0 ca07593549f1816d26a572e06dc1eab6 2017-09-19 14:36:39 2017-02-21 23:31:27 209
1 1b3190b2dfa9d789e1f14c05b647a14a 2018-09-19 23:24:07 2018-02-23 14:57:35 208
2 440d0d17af552815d15a9e41abe49359 2017-09-19 15:12:50 2017-03-07 23:59:51 195
3 2fb597c2f772eca01b1f5c561bf6cc7b 2017-09-19 14:33:17 2017-03-08 18:09:02 194
4 285ab9426d6982034523a855f55a885e 2017-09-19 14:00:04 2017-03-08 22:47:40 194
In [ ]:
sns.histplot(df_q15["delivery_time"], bins=30, kde=True)
plt.show()
No description has been provided for this image
In [ ]:
conn.rollback()

query_19 = """
SELECT c.customer_state AS states,
 AVG(o.order_delivered_customer_date - o.order_purchase_timestamp) AS avg_delivery_time
FROM "e-commerce-project".olist_orders_dataset AS o
JOIN "e-commerce-project".olist_customers_dataset AS c
  ON o.customer_id = c.customer_id
WHERE o.order_delivered_customer_date IS NOT NULL
GROUP BY states
ORDER BY avg_delivery_time DESC
"""

df_q16 = pd.read_sql(query_19, conn)
df_q16
/tmp/ipykernel_2257/1454990310.py:14: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
states avg_delivery_time
0 RR 29 days 09:18:03.975610
1 AP 27 days 04:26:29.850746
2 AM 26 days 10:13:25.613793
3 AL 24 days 13:03:09.103275
4 PA 23 days 18:33:00.021142
5 MA 21 days 13:45:05.167364
6 SE 21 days 12:28:29.707463
7 CE 21 days 06:23:52.394058
8 AC 21 days 00:51:25.600000
9 PB 20 days 10:14:32.721470
10 PI 19 days 10:58:13.289916
11 RO 19 days 08:55:43.954732
12 BA 19 days 08:03:04.247236
13 RN 19 days 06:40:32.696203
14 PE 18 days 10:45:35.106717
15 MT 18 days 01:20:17.319413
16 TO 17 days 15:47:36.609490
17 ES 15 days 18:56:36.134336
18 MS 15 days 14:50:22.763195
19 GO 15 days 14:33:07.649464
20 RJ 15 days 07:26:28.563264
21 RS 15 days 07:12:23.863024
22 SC 14 days 23:01:23.299972
23 DF 12 days 23:13:17.884615
24 MG 12 days 00:14:46.320827
25 PR 11 days 23:47:52.704448
26 SP 8 days 18:16:21.207112
In [ ]:
df_q16["avg_delivery_time"] = df_q16["avg_delivery_time"].dt.days
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="states", y="avg_delivery_time", data=df_q16, palette="magma")
plt.title("AVG Delivery Time Per States")
plt.xlabel("States")
plt.ylabel("AVG Delivery Time")
plt.show()
/tmp/ipykernel_2257/3687523790.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image

The "São Paulo Effect"

  • São Paulo has the fastest average delivery time at only 8 days.
  • This efficiency confirms that SP is the logistics hub of the platform. Proximity to sellers and high-density transport routes allow for "prime-like" speeds, which explains why SP also has the highest volume of orders.

The Geographic Penalty

  • There is a massive disparity in delivery times. While SP takes 8 days, states like RR (Roraima) and AP (Amapá) take nearly 30 days.
  • Customers in the North and Northeast regions experience a service level that is 3x to 4x slower than those in the Southeast. This is a significant pain point for market expansion in those regions.

Delivery Time Distribution

  • Histogram shows a "Long Tail." Most orders are delivered within 10–15 days, but there are extreme outliers taking over 200 days
  • These 200-day outliers are "Customer Service Nightmares." Even if they are few, they heavily damage the brand's reputation and lead to chargebacks or negative reviews.

Business Recommendations¶

Regional Distribution Centers: To reduce the 29-day wait in RR and AP, Olist should consider regional warehouses (Fulfillment centers) in the North/Northeast.

Manage Expectations: The platform should provide dynamic "Estimated Delivery Dates" based on the state. A customer in RR will be less frustrated if they are told "30 days" upfront than if they expect "standard" shipping.

Outlier Investigation: Audit the orders that took 200+ days. Were they lost in the mail? Were the sellers out of stock? Solving these "edge cases" will significantly improve the overall average.

Are late deliveries affecting customer reviews?¶

In [ ]:
conn.rollback()

query_20 = """
SELECT *
FROM "e-commerce-project".olist_orders_dataset
LIMIT 5

"""

df_q17 = pd.read_sql(query_20, conn)
df_q17
/tmp/ipykernel_2257/2481437227.py:10: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id customer_id order_status order_purchase_timestamp order_approved_atorder_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26
In [ ]:
conn.rollback()

query_21 = """
SELECT *
FROM "e-commerce-project".olist_order_reviews_dataset
LIMIT 5

"""

df_q18 = pd.read_sql(query_21, conn)
df_q18
/tmp/ipykernel_2257/1178195664.py:10: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
review_id order_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 None None 00:00:00+00:00 21:46:59+00:00
1 80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 None None 00:00:00+00:00 03:05:13+00:00
2 228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 None None 00:00:00+00:00 14:36:24+00:00
3 e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 None Recebi bem antes do prazo estipulado. 00:00:00+00:00 22:02:06+00:00
4 f7c4243c7fe1938f181bec41a392bdeb 8e6bfb81e283fa7e4f11123a3fb894f1 5 None Parabéns lojas lannister adorei comprar pela I... 00:00:00+00:00 10:26:53+00:00
In [ ]:
conn.rollback()

query_22 = """
SELECT o.order_delivered_customer_date, o.order_estimated_delivery_date,
CASE
  WHEN o.order_delivered_customer_date > o.order_estimated_delivery_date
  THEN 'delay'
  ELSE 'on_time'
END AS delay,
review_score
FROM "e-commerce-project".olist_orders_dataset AS o
JOIN "e-commerce-project".olist_order_reviews_dataset AS r
  ON o.order_id = r.order_id
WHERE o.order_delivered_customer_date IS NOT NULL

"""

df_q19 = pd.read_sql(query_22, conn)
df_q19
/tmp/ipykernel_2257/2355831121.py:18: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_delivered_customer_date order_estimated_delivery_date delay review_score
0 2017-10-10 21:25:13 2017-10-18 on_time 4
1 2018-08-07 15:27:45 2018-08-13 on_time 4
2 2018-08-17 18:06:29 2018-09-04 on_time 5
3 2017-12-02 00:28:42 2017-12-15 on_time 5
4 2018-02-16 18:17:02 2018-02-26 on_time 5
... ... ... ... ...
96354 2017-03-17 15:08:01 2017-03-28 on_time 5
96355 2018-02-28 17:37:56 2018-03-02 on_time 4
96356 2017-09-21 11:24:17 2017-09-27 on_time 5
96357 2018-01-25 23:32:54 2018-02-15 on_time 2
96358 2018-03-16 13:08:30 2018-04-03 on_time 5

96359 rows × 4 columns

In [ ]:
plt.figure(figsize=(8, 6))
sns.boxplot(x='delay', y='review_score', data=df_q19, palette='Set2')

plt.title('Distribution of Review Scores: On-Time vs. Delayed')
plt.xlabel('Delivery Status')
plt.ylabel('Review Score (1-5)')
plt.show()
/tmp/ipykernel_2257/4074529510.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image
In [ ]:
avg_scores = df_q19.groupby("delay")["review_score"].mean()
avg_scores
Out[ ]:
review_score
delay
delay 2.566550
on_time 4.293578

In [ ]:
plt.figure(figsize=(8, 6))
sns.barplot(x='delay', y='review_score', data=avg_scores.reset_index(), palette='viridis')

plt.title('Average Review Score by Delivery Status')
plt.xlabel('Delivery Status')
plt.ylabel('Average Review Score')
plt.ylim(0, 5)
plt.show()
/tmp/ipykernel_2257/169006175.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image

The "Review Penalty" of Delays

  • There is a massive drop in satisfaction when an order is delayed. The Average Review Score drops from 4.29 (on-time) to 2.57 (delayed).
  • A delay costs the platform nearly 1.7 points in satisfaction per order. This is the difference between a "highly recommended" service and a "poor" service.

Punctuality as a Minimum Standard

  • When orders are on-time, the review distribution (Boxplot) shows a median of 5.0.
  • For the Olist customer, receiving the package on or before the estimated date is the primary driver of a 5-star experience. Anything less than on-time delivery immediately jeopardizes the rating

High Variance in Delayed Orders

  • The boxplot for "delayed" orders is much larger, stretching from 1 to 4 stars.
  • This indicates that some customers are "forgiving" of short delays, but a significant portion immediately gives a 1-star rating. This volatility makes delayed orders highly unpredictable for brand reputation.

Business Recommendations¶

Buffer Time Strategy: Since "on-time" status is so critical, the platform could slightly increase the "Estimated Delivery Date" shown to the customer. It is better to "under-promise and over-deliver" than to miss a tight deadline.

Proactive Compensation: For orders identified as "delayed" in the system, Olist could automatically send a discount voucher before the customer leaves a review. This could "rescue" the review score from a 1 to a 3 or 4.

Prioritize High-Value Delays: Using your previous seller analysis, ensure that "Premium" (high-ticket) items never go into "delay" status, as a bad review on an expensive item carries more weight in the marketplace.

Which product categories receive the best and worst reviews?¶

In [ ]:
conn.rollback()

query_23 = """
SELECT *
FROM "e-commerce-project".olist_order_reviews_dataset
LIMIT 5
"""


df_q20 = pd.read_sql(query_23, conn)
df_q20
/tmp/ipykernel_2257/2456288014.py:10: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
review_id order_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 None None 00:00:00+00:00 21:46:59+00:00
1 80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 None None 00:00:00+00:00 03:05:13+00:00
2 228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 None None 00:00:00+00:00 14:36:24+00:00
3 e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 None Recebi bem antes do prazo estipulado. 00:00:00+00:00 22:02:06+00:00
4 f7c4243c7fe1938f181bec41a392bdeb 8e6bfb81e283fa7e4f11123a3fb894f1 5 None Parabéns lojas lannister adorei comprar pela I... 00:00:00+00:00 10:26:53+00:00
In [ ]:
conn.rollback()

query_24 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""

df_q21 = pd.read_sql(query_24, conn)
df_q21
/tmp/ipykernel_2257/563161855.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 09:45:35+00:00 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 11:05:13+00:00 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 14:48:30+00:00 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 10:10:18+00:00 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 13:57:51+00:00 199.90 18.14
In [ ]:
conn.rollback()

query_25 = """
SELECT *
FROM "e-commerce-project".olist_products_dataset
LIMIT 5
"""

df_q22 = pd.read_sql(query_25, conn)
df_q22
/tmp/ipykernel_2257/1145064856.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_id product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
0 1e9e8ef04dbcff4541ed26657ea517e5 perfumaria 40 287 1 225 16 10 14
1 3aa071139cb16b67ca9e5dea641aaa2f artes 44 276 1 1000 30 18 20
2 96bd76ec8810374ed1b65e291975717f esporte_lazer 46 250 1 154 18 9 15
3 cef67bcfe19066a932b7673e239eb23d bebes 27 261 1 371 26 4 26
4 9dc1a7de274444849c219cff195d0b71 utilidades_domesticas 37 402 4 625 20 17 13
In [ ]:
conn.rollback()

query_26 = """
SELECT *
FROM "e-commerce-project".product_category_name_translation
LIMIT 5
"""

df_q23 = pd.read_sql(query_26, conn)
df_q23
/tmp/ipykernel_2257/184134894.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_category_name product_category_name_english
0 beleza_saude health_beauty
1 informatica_acessorios computers_accessories
2 automotivo auto
3 cama_mesa_banho bed_bath_table
4 moveis_decoracao furniture_decor
In [ ]:
conn.rollback()

query_27 = """
SELECT t.product_category_name_english, AVG(r.review_score) AS reviews
FROM "e-commerce-project".olist_order_reviews_dataset AS r
JOIN "e-commerce-project".olist_order_items_dataset AS oi
  ON r.order_id = oi.order_id
JOIN "e-commerce-project".olist_products_dataset AS p
  ON oi.product_id = p.product_id
JOIN "e-commerce-project".product_category_name_translation AS t
  ON p.product_category_name = t.product_category_name
WHERE r.review_score IS NOT NULL
GROUP BY t.product_category_name_english
ORDER BY reviews
LIMIT 5
"""

df_q24 = pd.read_sql(query_27, conn)
df_q24
/tmp/ipykernel_2257/2711359845.py:18: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_category_name_english reviews
0 security_and_services 2.500000
1 diapers_and_hygiene 3.256410
2 office_furniture 3.493183
3 home_comfort_2 3.629630
4 fashion_male_clothing 3.641221
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="reviews", y="product_category_name_english", hue="product_category_name_english", data=df_q24, palette="Reds_d", legend=False)
plt.title("Top 5 Product Categories with Worst Reviews")
plt.xlabel("Average Review Score")
plt.ylabel("Product Category (English)")
plt.xlim(1, 5) # Set x-axis limit from 1 to 5 for review scores
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
plt.figure(figsize=(8, 6))
sns.heatmap(df_q24.set_index('product_category_name_english')[['reviews']],
            annot=True, cmap='Reds_r', fmt=".2f", linewidths=.5, cbar_kws={'label': 'Average Review Score'})
plt.title('Heatmap of Top 5 Product Categories with Worst Reviews')
plt.ylabel('Product Category (English)')
plt.xlabel('')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
conn.rollback()

query_28 = """
SELECT t.product_category_name_english, AVG(r.review_score) AS reviews
FROM "e-commerce-project".olist_order_reviews_dataset AS r
JOIN "e-commerce-project".olist_order_items_dataset AS oi
  ON r.order_id = oi.order_id
JOIN "e-commerce-project".olist_products_dataset AS p
  ON oi.product_id = p.product_id
JOIN "e-commerce-project".product_category_name_translation AS t
  ON p.product_category_name = t.product_category_name
WHERE r.review_score IS NOT NULL
GROUP BY t.product_category_name_english
ORDER BY reviews DESC
LIMIT 5
"""

df_q25 = pd.read_sql(query_28, conn)
df_q25
/tmp/ipykernel_2257/2301364224.py:18: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_category_name_english reviews
0 cds_dvds_musicals 4.642857
1 fashion_childrens_clothes 4.500000
2 books_general_interest 4.446266
3 costruction_tools_tools 4.444444
4 flowers 4.419355
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="reviews", y="product_category_name_english", hue="product_category_name_english", data=df_q25, palette="Greens_d", legend=False)
plt.title("Top 5 Product Categories with Best Reviews")
plt.xlabel("Average Review Score")
plt.ylabel("Product Category (English)")
plt.xlim(1, 5) # Set x-axis limit from 1 to 5 for review scores
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
plt.figure(figsize=(8, 6))
sns.heatmap(df_q25.set_index('product_category_name_english')[['reviews']],
            annot=True, cmap='Greens', fmt=".2f", linewidths=.5, cbar_kws={'label': 'Average Review Score'})
plt.title('Heatmap of Top 5 Product Categories with Best Reviews')
plt.ylabel('Product Category (English)')
plt.xlabel('')
plt.tight_layout()
plt.show()
No description has been provided for this image

The Red Zone: Critical Categories

  • security_and_services holds the lowest average score at 2.50.
  • Categories like diapers_and_hygiene and office_furniture are also struggling (scores below 3.5). These are often "high-expectation" categories where any delay or quality defect leads to immediate dissatisfaction.
  • For hygiene and office products, the platform might be facing issues with bulky shipping (leading to damage) or reliability of service providers.

The Green Zone: Excellence in Niche

  • cds_dvds_musicals leads with a stellar 4.64 average.
  • High-scoring categories like fashion_childrens_clothes and books_general_interest (all > 4.4) share a common trait: they are generally easy to ship, standardized, and less prone to mechanical failure.
  • These categories represent the "safe bets" for the platform. They drive positive sentiment and organic growth through word-of-mouth.

Business Recommendations¶

Quality Audit for Security: The score of 2.5 is a red flag. Olist should investigate if a specific seller is dragging down the average or if the category has inherent flaws (e.g., poor instructions or technical failure).

Logistics for Furniture: Since office_furniture is in the bottom 5, it correlates with our previous finding that bulky items suffer more delays. Improving the specialized "heavy-load" delivery network could raise these scores.

Cross-Sell from the Best: Use the high-satisfaction categories (like books or flowers) as "entry points" for new customers. A great first experience with a 4.6-star category builds trust before the customer tries a more complex one.

What is the average order value (AOV)?¶

In [ ]:
conn.rollback()

query_29 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""

df_q26 = pd.read_sql(query_29, conn)
df_q26
/tmp/ipykernel_2257/2055158174.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 09:45:35+00:00 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 11:05:13+00:00 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 14:48:30+00:00 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 10:10:18+00:00 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 13:57:51+00:00 199.90 18.14
In [ ]:
conn.rollback()

query_30 = """
SELECT *
FROM "e-commerce-project".olist_orders_dataset
LIMIT 5
"""

df_q27 = pd.read_sql(query_30, conn)
df_q27
/tmp/ipykernel_2257/1789558180.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id customer_id order_status order_purchase_timestamp order_approved_atorder_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26
In [ ]:
conn.rollback()

query_31 = """
SELECT SUM(oi.price) AS total_revenue, COUNT(o.order_id) AS total_orders, SUM(oi.price)/COUNT(o.order_id) AS aov
FROM "e-commerce-project".olist_orders_dataset AS o
JOIN "e-commerce-project".olist_order_items_dataset AS oi
  ON o.order_id = oi.order_id
"""

df_q28 = pd.read_sql(query_31, conn)
df_q28
/tmp/ipykernel_2257/112653745.py:10: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
total_revenue total_orders aov
0 13591643.7 112650 120.653739
In [ ]:
conn.rollback()

query_32 = """
SELECT TO_CHAR(o.order_purchase_timestamp, 'YYYY-MM') AS monthly, SUM(oi.price) AS total_revenue, COUNT(o.order_id) AS total_orders, SUM(oi.price)/COUNT(o.order_id) AS aov
FROM "e-commerce-project".olist_orders_dataset AS o
JOIN "e-commerce-project".olist_order_items_dataset AS oi
  ON o.order_id = oi.order_id
GROUP BY monthly
ORDER BY monthly
"""

df_q29 = pd.read_sql(query_32, conn)
df_q29
/tmp/ipykernel_2257/371705559.py:12: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
monthly total_revenue total_orders aov
0 2016-09 267.36 6 44.560000
1 2016-10 49507.66 363 136.384738
2 2016-12 10.90 1 10.900000
3 2017-01 120312.87 955 125.982063
4 2017-02 247303.02 1951 126.757058
5 2017-03 374344.30 3000 124.781433
6 2017-04 359927.23 2684 134.101054
7 2017-05 506071.14 4136 122.357626
8 2017-06 433038.60 3583 120.859224
9 2017-07 498031.48 4519 110.208338
10 2017-08 573971.68 4910 116.898509
11 2017-09 624401.69 4831 129.248953
12 2017-10 664219.43 5322 124.806357
13 2017-11 1010271.37 8665 116.592195
14 2017-12 743914.17 6308 117.931860
15 2018-01 950030.36 8208 115.744440
16 2018-02 844178.71 7672 110.033721
17 2018-03 983213.44 8217 119.656011
18 2018-04 996647.75 7975 124.971505
19 2018-05 996517.68 7925 125.743556
20 2018-06 865124.31 7078 122.227227
21 2018-07 895507.22 7092 126.270054
22 2018-08 854686.33 7248 117.920299
23 2018-09 145.00 1 145.000000
In [ ]:
plt.figure(figsize=(14, 7))
sns.lineplot(x='monthly', y='aov', data=df_q29)
plt.title('Monthly Average Order Value (AOV) Trend')
plt.xlabel('Month')
plt.ylabel('Average Order Value')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
No description has been provided for this image

Consistent Exponential Growth

  • Total monthly revenue grew from a mere 267 in late 2016 to nearly 1 Million per month by late 2017 and 2018.
  • The platform experienced a massive scaling phase during 2017. The volume of orders increased proportionally, showing that Olist successfully acquired both new sellers and new customers at a high rate.

Stability of the Average Order Value (AOV)

  • Despite the massive increase in total orders, the AOV has remained remarkably stable, fluctuating mostly between $110 and $130.
  • This stability is a good sign; it means that as the platform grows, it isn't just attracting "cheap" low-value sales. It is maintaining its middle-market position. The peaks in the line chart likely represent periods where high-ticket items (like Electronics) had successful promotions.

Business Recommendations¶

AOV Growth Campaigns: Since AOV is stable but flat, Olist could implement "Upselling" strategies (e.g., "Buy 3, get 10% off") or Product Bundling to push the AOV from $120 closer to $150.

Infrastructure Readiness: Based on the November spike, the platform must ensure logistics and server capacity are doubled every October to prepare for the annual sales peak.

Low-Month Analysis: Investigate the slight dips in February or June to see if they are related to Brazilian holidays (like Carnival) and create specific "Off-Season" promotions to smooth out the revenue curve.

Which cities have the most customers?¶

In [ ]:
conn.rollback()

query_33 = """
SELECT *
FROM "e-commerce-project".olist_customers_dataset
LIMIT 5
"""

df_q30 = pd.read_sql(query_33, conn)
df_q30
/tmp/ipykernel_2257/2706480270.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
customer_id customer_unique_id customer_city customer_state customer_zip_code_prefix
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 franca SP 14409
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 sao bernardo do campo SP 09790
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e sao paulo SP 01151
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c mogi das cruzes SP 08775
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 campinas SP 13056
In [ ]:
conn.rollback()

query_33 = """
SELECT customer_state AS state, customer_city AS city, COUNT(customer_id) AS total_customer
FROM "e-commerce-project".olist_customers_dataset
GROUP BY state, city
ORDER BY total_customer DESC
"""

df_q31 = pd.read_sql(query_33, conn)
df_q31
/tmp/ipykernel_2257/1480094160.py:10: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
state city total_customer
0 SP sao paulo 15540
1 RJ rio de janeiro 6882
2 MG belo horizonte 2773
3 DF brasilia 2131
4 PR curitiba 1521
... ... ... ...
4305 ES santa leopoldina 1
4306 SP flora rica 1
4307 GO teresina de goias 1
4308 TO novo alegre 1
4309 MG frei lagonegro 1

4310 rows × 3 columns

In [ ]:
df_states = (
    df_q31
    .groupby("state", as_index=False)["total_customer"]
    .sum()
)
In [ ]:
url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson"
geojson = requests.get(url).json()

fig = px.choropleth(
df_states,
geojson=geojson,
locations="state",
featureidkey="properties.sigla", # matches SP, RJ, MG etc
color="total_customer",
hover_name="state",
hover_data=["total_customer"],
title="Clients by Brazilian State",
color_continuous_scale="Reds"
)

fig.update_geos(
fitbounds="locations",
visible=False
)

fig.show()
In [ ]:
top_10_citys = df_q31.head(10)
In [ ]:
plt.figure(figsize=(12, 7))
sns.barplot(x="total_customer", y="city", hue="city", data=top_10_citys, palette="Reds", legend=False)
plt.title("Top 10 Citys With Most Customers")
plt.xlabel("Total Customers")
plt.ylabel("citys")
plt.show()
No description has been provided for this image

The "Heart" of the Market: São Paulo

  • São Paulo is the absolute leader with 15,540 customers, nearly triple that of the second-place city (Rio de Janeiro).
  • SP is not just a market; it is the platform's core ecosystem. The high density allows for lower shipping costs and the 8-day delivery average we observed earlier.

Urban Concentration

  • The top 10 cities are primarily state capitals in the Southeast and South.
  • While the platform is "national," the revenue is highly regionalized. States like Amapá (AP) and Roraima (RR) show very low customer density on the choropleth map, which correlates with the 30-day delivery times previously identified.

What factors influence product price and freight cost?¶

In [ ]:
conn.rollback()

query_34 = """
SELECT *
FROM "e-commerce-project".olist_products_dataset
LIMIT 5
"""

df_q32 = pd.read_sql(query_34, conn)
df_q32
/tmp/ipykernel_2257/348530697.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_id product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
0 1e9e8ef04dbcff4541ed26657ea517e5 perfumaria 40 287 1 225 16 10 14
1 3aa071139cb16b67ca9e5dea641aaa2f artes 44 276 1 1000 30 18 20
2 96bd76ec8810374ed1b65e291975717f esporte_lazer 46 250 1 154 18 9 15
3 cef67bcfe19066a932b7673e239eb23d bebes 27 261 1 371 26 4 26
4 9dc1a7de274444849c219cff195d0b71 utilidades_domesticas 37 402 4 625 20 17 13
In [ ]:
conn.rollback()

query_35 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""

df_q33 = pd.read_sql(query_35, conn)
df_q33
/tmp/ipykernel_2257/712068128.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 09:45:35+00:00 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 11:05:13+00:00 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 14:48:30+00:00 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 10:10:18+00:00 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 13:57:51+00:00 199.90 18.14
In [ ]:
conn.rollback()

query_36 = """
SELECT p.product_weight_g, oi.freight_value
FROM "e-commerce-project".olist_products_dataset AS p
JOIN "e-commerce-project".olist_order_items_dataset AS oi
  ON p.product_id = oi.product_id
WHERE P.product_weight_g IS NOT NULL OR oi.freight_value IS NOT NULL
"""

df_q34 = pd.read_sql(query_36, conn)
df_q34
/tmp/ipykernel_2257/3334282439.py:11: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_weight_g freight_value
0 650.0 13.29
1 30000.0 19.93
2 3050.0 17.87
3 200.0 12.79
4 3750.0 18.14
... ... ...
112645 10150.0 43.41
112646 8950.0 36.53
112647 967.0 16.95
112648 100.0 8.72
112649 600.0 12.79

112650 rows × 2 columns

In [ ]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='product_weight_g', y='freight_value', data=df_q34, alpha=0.6)
plt.title('Product Weight vs. Freight Value')
plt.xlabel('Product Weight (g)')
plt.ylabel('Freight Value')
plt.ylim(0, 150)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
correlation = df_q34['product_weight_g'].corr(df_q34['freight_value'])
print(f"Correlation between Product Weight and Freight Value: {correlation:.2f}")
Correlation between Product Weight and Freight Value: 0.61
In [ ]:
conn.rollback()

query_37 = """
SELECT p.product_length_cm * p.product_height_cm	* p.product_width_cm AS size, oi.freight_value
FROM "e-commerce-project".olist_products_dataset AS p
JOIN "e-commerce-project".olist_order_items_dataset AS oi
  ON p.product_id = oi.product_id
WHERE p.product_length_cm IS NOT NULL OR p.product_height_cm IS NOT NULL OR p.product_width_cm IS NOT NULL OR oi.freight_value IS NOT NULL
"""

df_q35 = pd.read_sql(query_37, conn)
df_q35
/tmp/ipykernel_2257/3112260560.py:11: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
size freight_value
0 3528.0 13.29
1 60000.0 19.93
2 14157.0 17.87
3 2400.0 12.79
4 42000.0 18.14
... ... ...
112645 53400.0 43.41
112646 44460.0 36.53
112647 9576.0 16.95
112648 8000.0 8.72
112649 1710.0 12.79

112650 rows × 2 columns

In [ ]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='size', y='freight_value', data=df_q35, alpha=0.6)
plt.title('Product Size vs. Freight Value')
plt.xlabel('Product Size')
plt.ylabel('Freight Value')
plt.ylim(0, 150)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
correlation = df_q35['size'].corr(df_q35['freight_value'])
print(f"Correlation between Product Size and Freight Value: {correlation:.2f}")
Correlation between Product Size and Freight Value: 0.59
In [ ]:
conn.rollback()

query_38 = """
SELECT price, freight_value
FROM "e-commerce-project".olist_order_items_dataset
WHERE price IS NOT NULL OR freight_value IS NOT NULL
"""

df_q36 = pd.read_sql(query_38, conn)
df_q36
/tmp/ipykernel_2257/30038573.py:9: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
price freight_value
0 58.90 13.29
1 239.90 19.93
2 199.00 17.87
3 12.99 12.79
4 199.90 18.14
... ... ...
112645 299.99 43.41
112646 350.00 36.53
112647 99.90 16.95
112648 55.99 8.72
112649 43.00 12.79

112650 rows × 2 columns

In [ ]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='price', y='freight_value', data=df_q36, alpha=0.6)
plt.title('Product Price vs. Freight Value')
plt.xlabel('Product Price')
plt.ylabel('Freight Value')
plt.ylim(0, 150)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
correlation = df_q36['price'].corr(df_q36['freight_value'])
print(f"Correlation between Product Price and Freight Value: {correlation:.2f}")
Correlation between Product Price and Freight Value: 0.41
In [ ]:
conn.rollback()

query_39 = """
SELECT
    p.product_weight_g, p.product_length_cm * p.product_height_cm * p.product_width_cm AS size, oi.freight_value
FROM "e-commerce-project".olist_products_dataset AS p
JOIN "e-commerce-project".olist_order_items_dataset AS oi
    ON p.product_id = oi.product_id
WHERE p.product_weight_g IS NOT NULL
    OR p.product_length_cm IS NOT NULL
    OR p.product_height_cm IS NOT NULL
    OR p.product_width_cm IS NOT NULL
    OR oi.freight_value IS NOT NULL
"""

df_q37 = pd.read_sql(query_39, conn)
df_q37
/tmp/ipykernel_2257/2218929922.py:16: UserWarning:

pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

Out[ ]:
product_weight_g size freight_value
0 650.0 3528.0 13.29
1 30000.0 60000.0 19.93
2 3050.0 14157.0 17.87
3 200.0 2400.0 12.79
4 3750.0 42000.0 18.14
... ... ... ...
112645 10150.0 53400.0 43.41
112646 8950.0 44460.0 36.53
112647 967.0 9576.0 16.95
112648 100.0 8000.0 8.72
112649 600.0 1710.0 12.79

112650 rows × 3 columns

In [ ]:
correlation = df_q37[['product_weight_g', 'size', 'freight_value']].corr()
print(correlation)
                  product_weight_g      size  freight_value
product_weight_g          1.000000  0.802267        0.61042
size                      0.802267  1.000000        0.58727
freight_value             0.610420  0.587270        1.00000

Relationship Between Product Weight and Freight Value¶

  1. Visual Trend: The scatter plot reveals a clear upward trend, where higher product weights generally correspond to higher freight values.
  2. Correlation Coefficient: The calculated correlation of 0.61 indicates a strong positive relationship.
  3. Observation: This strong correlation suggests that as the weight of a product increases, shipping costs rise significantly. This is likely driven by carrier pricing models that utilize weight-based brackets or volumetric weight calculations, making product weight a primary driver of logistics expenses.

Product Size (Volume) vs. Freight Value¶

1. Correlation Analysis

  • The correlation coefficient between Product Size (Volume) and Freight Value is 0.59.
  • This indicates a moderate positive relationship, confirming that as the physical dimensions of a product increase, the cost of shipping tends to rise.

2. Comparison with Product Weight

  • When compared to the correlation between Product Weight and Freight Value (0.61), volume is a slightly weaker driver of shipping costs in this dataset.
  • This suggests that while carrier space (volumetric weight) is a significant factor in pricing, the actual weight of the package remains the primary metric used by logistics partners for cost calculation.

3. Visual Observations

  • The scatter plot shows a high concentration of data points for smaller volumes, with a 'long tail' for larger items.
  • The variance increases for larger sizes, suggesting that for very bulky items, other factors (such as distance or specialized handling) might play a more dominant role than just the dimensions themselves.

Price vs. Freight Value Analysis¶

  1. Correlation Assessment: The correlation coefficient between product price and freight value is 0.41. This indicates a moderate positive correlation.

  2. Price as a Driver: While there is a visible upward trend, the 0.41 value suggests that price is a secondary driver of freight costs compared to weight (0.61) and size (0.59). Shipping costs are more heavily dictated by physical logistics than by the value of the item.

  3. The 'Insurance Effect':

    • Higher-priced items do show a slight increase in shipping fees, which likely accounts for insurance premiums or special handling fees for high-ticket goods.
    • However, the significant spread in the scatter plot shows many expensive items still have low freight costs (if they are small/light), while many cheap items have high freight costs (if they are bulky).

Detailed Business Strategies and Actionable Recommendations¶

1. Logistics Optimization Strategy: Heavy & Bulky Items¶

Weight (0.61) and Size (0.59) are the primary drivers of freight cost, significantly more so than Price (0.41).

  • Action: Negotiate specialized freight contracts for 'Heavy Load' categories (Furniture, Garden Tools). Moving away from general postal rates to dedicated LTL (Less-Than-Truckload) carriers for items over 10kg or specific volumetric dimensions can reduce costs by 15-20%.
  • Target: Office Furniture and Housewares, which currently suffer from high freight-to-price ratios.

2. 'Flat-Rate' Competitiveness for High-Value Items¶

High-ticket items like Watches and Health & Beauty have a lower correlation with freight costs relative to their value.

  • Action: Implement a Flat-Rate Shipping model for items weighing under 1kg with a value exceeding $200.
  • Impact: This increases the competitiveness of premium sellers and encourages customers to add more high-value, low-weight items to their carts without the friction of calculated shipping.

3. Mitigating the 'Geographic Penalty' via Regional Hubs¶

Northern states (RR, AP, AM) experience 3x-4x longer delivery times (up to 30 days) and higher costs compared to São Paulo (8 days).

  • Action: Establish Regional Fulfillment Centers in the Northeast (e.g., Fortaleza or Recife).
  • Impact: By pre-positioning top-selling products (identified in the 'Bed, Bath & Table' volume analysis) in these regions, Olist can reduce delivery times to under 10 days and slash long-haul freight costs, unlocking growth in currently underserved markets.

4. Packaging Efficiency & Volumetric Reduction¶

There is a high collinearity (0.80) between product size and weight.

  • Action: Launch a 'Sustainable Packaging Initiative' for sellers. Provide guidelines and subsidized eco-friendly, right-sized packaging to reduce 'dead air' in shipments.
  • Impact: Reducing the volumetric weight of bulky items directly impacts the 0.59 correlation factor of size-to-freight, immediately improving margins for both the platform and the sellers.

5. Buffer-Time & Review Score Protection¶

Late deliveries cause a massive drop in satisfaction (4.29 to 2.57 avg score).

  • Action: Implement a 'Dynamic Buffer' in the checkout process. For regions with high delivery variance (North/Northeast), add 3-5 days to the Estimated Delivery Date while simultaneously triggering proactive discount vouchers if a delay is detected in the carrier's API.
  • Impact: Under-promising and over-delivering will protect the platform's brand reputation and reduce 1-star reviews.