!pip install psycopg2-binary
import psycopg2
from google.colab import userdata
password_post = userdata.get('postpass')
conn = psycopg2.connect(
host="8.tcp.ngrok.io",
port=16178,
database="test_db",
user="postgres",
password=password_post
)
cursor = conn.cursor()
cursor.execute("SELECT version();")
print(cursor.fetchone())
Collecting psycopg2-binary Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (4.9 kB) Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (4.2 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.2/4.2 MB 27.2 MB/s eta 0:00:00 Installing collected packages: psycopg2-binary Successfully installed psycopg2-binary-2.9.11 ('PostgreSQL 18.1 on x86_64-windows, compiled by msvc-19.44.35221, 64-bit',)
conn.rollback()
cursor.execute("""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'e-commerce-project';
""")
print(cursor.fetchall())
[('olist_products_dataset',), ('olist_sellers_dataset',), ('product_category_name_translation',), ('olist_order_reviews_dataset',), ('olist_geolocation_dataset',), ('olist_order_items_dataset',), ('olist_order_payments_dataset',), ('olist_customers_dataset',), ('olist_orders_dataset',)]
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install plotly
import plotly.express as px
Requirement already satisfied: plotly in /usr/local/lib/python3.12/dist-packages (5.24.1) Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.12/dist-packages (from plotly) (9.1.4) Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from plotly) (26.0)
What are the top-selling product categories?¶
query_1 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""
df_items = pd.read_sql(query_1, conn)
df_items
/tmp/ipykernel_2257/425679890.py:6: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 09:45:35+00:00 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 11:05:13+00:00 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 14:48:30+00:00 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 10:10:18+00:00 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 13:57:51+00:00 | 199.90 | 18.14 |
query_2 = """
SELECT *
FROM "e-commerce-project".olist_products_dataset
LIMIT 5
"""
df_products = pd.read_sql(query_2, conn)
df_products
/tmp/ipykernel_2257/4179090135.py:6: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | perfumaria | 40 | 287 | 1 | 225 | 16 | 10 | 14 |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | artes | 44 | 276 | 1 | 1000 | 30 | 18 | 20 |
| 2 | 96bd76ec8810374ed1b65e291975717f | esporte_lazer | 46 | 250 | 1 | 154 | 18 | 9 | 15 |
| 3 | cef67bcfe19066a932b7673e239eb23d | bebes | 27 | 261 | 1 | 371 | 26 | 4 | 26 |
| 4 | 9dc1a7de274444849c219cff195d0b71 | utilidades_domesticas | 37 | 402 | 4 | 625 | 20 | 17 | 13 |
query_3 = """
SELECT *
FROM "e-commerce-project".product_category_name_translation
LIMIT 5
"""
df_trans = pd.read_sql(query_3, conn)
df_trans
/tmp/ipykernel_2257/2897695490.py:6: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_category_name | product_category_name_english | |
|---|---|---|
| 0 | beleza_saude | health_beauty |
| 1 | informatica_acessorios | computers_accessories |
| 2 | automotivo | auto |
| 3 | cama_mesa_banho | bed_bath_table |
| 4 | moveis_decoracao | furniture_decor |
conn.rollback()
query_4 = """
SELECT t.product_category_name_english, COUNT(order_id) AS total_orders
FROM "e-commerce-project".olist_order_items_dataset o
JOIN "e-commerce-project".olist_products_dataset p
ON o.product_id = p.product_id
JOIN "e-commerce-project".product_category_name_translation t
ON p.product_category_name = t.product_category_name
GROUP BY t.product_category_name_english
ORDER BY total_orders DESC
LIMIT 10
"""
df_q1 = pd.read_sql(query_4, conn)
df_q1
/tmp/ipykernel_2257/297876490.py:13: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_category_name_english | total_orders | |
|---|---|---|
| 0 | bed_bath_table | 11115 |
| 1 | health_beauty | 9670 |
| 2 | sports_leisure | 8641 |
| 3 | furniture_decor | 8334 |
| 4 | computers_accessories | 7827 |
| 5 | housewares | 6964 |
| 6 | watches_gifts | 5991 |
| 7 | telephony | 4545 |
| 8 | garden_tools | 4347 |
| 9 | auto | 4235 |
plt.figure(figsize=(12, 7))
sns.barplot(x="total_orders", y="product_category_name_english", data=df_q1, palette="viridis")
plt.title("Top 10 Product Categories")
plt.xlabel("Total Orders")
plt.ylabel("Product category")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/2842852274.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
conn.rollback()
query_5 = """
SELECT t.product_category_name_english, SUM(o.price) AS revenue
FROM "e-commerce-project".olist_order_items_dataset AS o
INNER JOIN "e-commerce-project".olist_products_dataset AS p
ON o.product_id = p.product_id
INNER JOIN "e-commerce-project".product_category_name_translation AS t
ON p.product_category_name = t.product_category_name
GROUP BY t.product_category_name_english
ORDER BY revenue DESC
LIMIT 10
"""
df_q2 = pd.read_sql(query_5, conn)
df_q2
/tmp/ipykernel_2257/187717851.py:13: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_category_name_english | revenue | |
|---|---|---|
| 0 | health_beauty | 1258681.34 |
| 1 | watches_gifts | 1205005.68 |
| 2 | bed_bath_table | 1036988.68 |
| 3 | sports_leisure | 988048.97 |
| 4 | computers_accessories | 911954.32 |
| 5 | furniture_decor | 729762.49 |
| 6 | cool_stuff | 635290.85 |
| 7 | housewares | 632248.66 |
| 8 | auto | 592720.11 |
| 9 | garden_tools | 485256.46 |
plt.figure(figsize=(12, 7))
sns.barplot(x="revenue", y="product_category_name_english", data=df_q2, palette="magma")
plt.title("Total revenue per category")
plt.xlabel("Revenue")
plt.ylabel("Product Category")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/2624280649.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
conn.rollback()
query_6 = """
SELECT t.product_category_name_english, SUM(o.price + o.freight_value) AS revenue
FROM "e-commerce-project".olist_order_items_dataset AS o
INNER JOIN "e-commerce-project".olist_products_dataset AS p
ON o.product_id = p.product_id
INNER JOIN "e-commerce-project".product_category_name_translation AS t
ON p.product_category_name = t.product_category_name
GROUP BY t.product_category_name_english
ORDER BY revenue DESC
LIMIT 10
"""
df_q3 = pd.read_sql(query_6, conn)
df_q3
/tmp/ipykernel_2257/1506463664.py:13: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_category_name_english | revenue | |
|---|---|---|
| 0 | health_beauty | 1441248.07 |
| 1 | watches_gifts | 1305541.61 |
| 2 | bed_bath_table | 1241681.72 |
| 3 | sports_leisure | 1156656.48 |
| 4 | computers_accessories | 1059272.40 |
| 5 | furniture_decor | 902511.79 |
| 6 | housewares | 778397.77 |
| 7 | cool_stuff | 719329.95 |
| 8 | auto | 685384.32 |
| 9 | garden_tools | 584219.21 |
plt.figure(figsize=(12, 7))
sns.barplot(x="revenue", y="product_category_name_english", data=df_q3, palette="magma")
plt.title("Total revenue per category")
plt.xlabel("Revenue")
plt.ylabel("Product Category")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/3599276967.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
Revenue Leadership: Health & Beauty
- Health & Beauty stands as the most profitable category with a total revenue of $1,258,681.34
- This suggests a high consumer demand coupled with a strong price-per-unit ratio, making it the "star" category of the ecosystem.
High-Ticket Strategy: Watches & Gifts
- A major insight is found in Watches & Gifts. While it ranks lower in total order volume (7th place), it ranks 2nd in total revenue.
- This category relies on high-ticket items. It requires fewer sales to generate massive revenue, indicating a premium market segment.
Volume vs. Value: Bed, Bath & Table
- Bed, Bath & Table is the absolute leader in quantity of orders (over 11,000). However, it drops to 3rd place in revenue.
- This indicates that shipping costs represent a significant portion of the total transaction value for these categories, likely due to their weight or size.
Logistic Impact
- By comparing we notice that categories like Housewares and Garden Tools lose ground when freight is removed.
- This indicates that shipping costs represent a significant portion of the total transaction value for these categories, likely due to their weight or size.
Business Recommendations¶
Marketing Focus: Allocate more budget to Health & Beauty and Watches & Gifts to maximize Return on Ad Spend (ROAS).
Retention: Since Bed, Bath & Table has the most orders, use it as a "hook" for loyalty programs or cross-selling other products.
Logistics: Review shipping contracts for Housewares to reduce the "Freight Gap" and make those products more competitive.
Which states generate the highest revenue?¶
conn.rollback()
query_7 = """
SELECT *
FROM "e-commerce-project".olist_orders_dataset
LIMIT 5
"""
df_q4 = pd.read_sql(query_7, conn)
df_q4
/tmp/ipykernel_2257/31892081.py:7: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_atorder_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
conn.rollback()
query_8 = """
SELECT *
FROM "e-commerce-project".olist_customers_dataset
LIMIT 5
"""
df_q5 = pd.read_sql(query_8, conn)
df_q5
/tmp/ipykernel_2257/3662583808.py:8: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| customer_id | customer_unique_id | customer_city | customer_state | customer_zip_code_prefix | |
|---|---|---|---|---|---|
| 0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | franca | SP | 14409 |
| 1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | sao bernardo do campo | SP | 09790 |
| 2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | sao paulo | SP | 01151 |
| 3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | mogi das cruzes | SP | 08775 |
| 4 | 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | campinas | SP | 13056 |
conn.rollback()
query_9 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""
df_q6 = pd.read_sql(query_9, conn)
df_q6
/tmp/ipykernel_2257/2514142902.py:7: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 09:45:35+00:00 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 11:05:13+00:00 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 14:48:30+00:00 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 10:10:18+00:00 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 13:57:51+00:00 | 199.90 | 18.14 |
conn.rollback()
query_10 = """
SELECT c.customer_state AS state, COUNT(DISTINCT o.order_id) AS total_orders, SUM(oi.price) AS revenue
FROM "e-commerce-project".olist_customers_dataset AS c
JOIN "e-commerce-project".olist_orders_dataset AS o
ON c.customer_id = o.customer_id
JOIN "e-commerce-project".olist_order_items_dataset AS oi
ON oi.order_id = o.order_id
GROUP BY customer_state
ORDER BY revenue DESC
"""
df_q7 = pd.read_sql(query_10, conn)
df_q7
/tmp/ipykernel_2257/363883029.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| state | total_orders | revenue | |
|---|---|---|---|
| 0 | SP | 41375 | 5202955.05 |
| 1 | RJ | 12762 | 1824092.67 |
| 2 | MG | 11544 | 1585308.03 |
| 3 | RS | 5432 | 750304.02 |
| 4 | PR | 4998 | 683083.76 |
| 5 | SC | 3612 | 520553.34 |
| 6 | BA | 3358 | 511349.99 |
| 7 | DF | 2125 | 302603.94 |
| 8 | GO | 2007 | 294591.95 |
| 9 | ES | 2025 | 275037.31 |
| 10 | PE | 1648 | 262788.03 |
| 11 | CE | 1327 | 227254.71 |
| 12 | PA | 970 | 178947.81 |
| 13 | MT | 903 | 156453.53 |
| 14 | MA | 740 | 119648.22 |
| 15 | MS | 709 | 116812.64 |
| 16 | PB | 532 | 115268.08 |
| 17 | PI | 493 | 86914.08 |
| 18 | RN | 482 | 83034.98 |
| 19 | AL | 411 | 80314.81 |
| 20 | SE | 345 | 58920.85 |
| 21 | TO | 279 | 49621.74 |
| 22 | RO | 247 | 46140.64 |
| 23 | AM | 147 | 22356.84 |
| 24 | AC | 81 | 15982.95 |
| 25 | AP | 68 | 13474.30 |
| 26 | RR | 46 | 7829.43 |
import json
import requests
url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson"
geojson = requests.get(url).json()
fig = px.choropleth(
df_q7,
geojson=geojson,
locations="state",
featureidkey="properties.sigla", # matches SP, RJ, MG etc
color="revenue",
hover_name="state",
hover_data=["total_orders", "revenue"],
title="Revenue by Brazilian State",
color_continuous_scale="Blues"
)
fig.update_geos(
fitbounds="locations",
visible=False
)
fig.show()
The Absolute Dominance of São Paulo
- São Paulo is not just the leader; it is the economic engine of the platform. With $5,202,955 in revenue, it generates nearly 3 times more than the second-place state (Rio de Janeiro).
- This dominance is likely due to better logistics, more concentrated seller hubs, and a higher density of e-commerce users.
Southeast Hub Centralization
- The Top 3 states (SP, RJ, MG) all belong to the Southeast Region.
- These three states combined represent the vast majority of the company's total cash flow. Business efforts are highly centralized in the industrial and financial heart of Brazil.
The Logistics Gap (North vs. South)
- States like RR (Roraima) and AP (Amapá) show the lowest activity, with fewer than 100 orders each
- The sharp decline in revenue as we move away from the South/Southeast suggests that high shipping costs (freight) and long delivery times are significant barriers for customers in remote regions.
Business Recommendations¶
Fulfillment Centers: Consider placing more "Fulfillment by Olist" centers in SP and MG to maintain dominance and reduce delivery times even further.
Subsidized Shipping: To grow the market in the North/Northeast, a strategy involving subsidized freight or regional distribution centers might be necessary to convert potential customers who are currently deterred by shipping prices.
Targeted Marketing: Focus high-conversion ad campaigns in RJ and MG, as they show high volume but still have significant room to grow compared to SP.
Who are the top-performing sellers?¶
conn.rollback()
query_11 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 10
"""
df_q8 = pd.read_sql(query_11, conn)
df_q8
/tmp/ipykernel_2257/839127881.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 09:45:35+00:00 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 11:05:13+00:00 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 14:48:30+00:00 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 10:10:18+00:00 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 13:57:51+00:00 | 199.90 | 18.14 |
| 5 | 00048cc3ae777c65dbb7d2a0634bc1ea | 1 | ef92defde845ab8450f9d70c526ef70f | 6426d21aca402a131fc0a5d0960a3c90 | 03:55:27+00:00 | 21.90 | 12.69 |
| 6 | 00054e8431b9d7675808bcb819fb4a32 | 1 | 8d4f2bb7e93e6710a28f34fa83ee7d28 | 7040e82f899a04d1b434b795a43b4617 | 12:10:31+00:00 | 19.90 | 11.85 |
| 7 | 000576fe39319847cbb9d288c5617fa6 | 1 | 557d850972a7d6f792fd18ae1400d9b6 | 5996cddab893a4652a15592fb58ab8db | 12:30:45+00:00 | 810.00 | 70.75 |
| 8 | 0005a1a1728c9d785b8e2b08b904576c | 1 | 310ae3c140ff94b03219ad0adc3c778f | a416b6a846a11724393025641d4edd5e | 18:31:29+00:00 | 145.95 | 11.65 |
| 9 | 0005f50442cb953dcd1d21e1fb923495 | 1 | 4535b0e1091c278dfd193e5a1d63b39f | ba143b05f0110f0dc71ad71b4466ce92 | 14:10:56+00:00 | 53.99 | 11.40 |
conn.rollback()
query_12 = """
SELECT *
FROM "e-commerce-project".olist_sellers_dataset
LIMIT 10
"""
df_q9 = pd.read_sql(query_12, conn)
df_q9
/tmp/ipykernel_2257/4099490292.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| seller_id | seller_zip_code_prefix | seller_city | seller_state | |
|---|---|---|---|---|
| 0 | 3442f8959a84dea7ee197c632cb2df15 | 13023 | campinas | SP |
| 1 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844 | mogi guacu | SP |
| 2 | ce3ad9de960102d0677a81f5d0bb7b2d | 20031 | rio de janeiro | RJ |
| 3 | c0f3eea2e14555b6faeea3dd58c1b1c3 | 4195 | sao paulo | SP |
| 4 | 51a04a8a6bdcb23deccc82b0b80742cf | 12914 | braganca paulista | SP |
| 5 | c240c4061717ac1806ae6ee72be3533b | 20920 | rio de janeiro | RJ |
| 6 | e49c26c3edfa46d227d5121a6b6e4d37 | 55325 | brejao | PE |
| 7 | 1b938a7ec6ac5061a66a3766e0e75f90 | 16304 | penapolis | SP |
| 8 | 768a86e36ad6aae3d03ee3c6433d61df | 1529 | sao paulo | SP |
| 9 | ccc4bbb5f32a6ab2b7066a4130f114e3 | 80310 | curitiba | PR |
conn.rollback()
query_13 = """
SELECT DISTINCT s.seller_id AS seller, sum(oi.price) AS revenue
FROM "e-commerce-project".olist_order_items_dataset AS oi
JOIN "e-commerce-project".olist_sellers_dataset AS S
ON oi.seller_id = s.seller_id
GROUP BY seller
ORDER BY revenue DESC
LIMIT 10
"""
df_q10 = pd.read_sql(query_13, conn)
df_q10
/tmp/ipykernel_2257/1905358826.py:13: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| seller | revenue | |
|---|---|---|
| 0 | 4869f7a5dfa277a7dca6462dcf3b52b2 | 229472.63 |
| 1 | 53243585a1d6dc2643021fd1853d8905 | 222776.05 |
| 2 | 4a3ca9315b744ce9f8e9374361493884 | 200472.92 |
| 3 | fa1c13f2614d7b5c4749cbc52fecda94 | 194042.03 |
| 4 | 7c67e1448b00f6e969d365cea6b010ab | 187923.89 |
| 5 | 7e93a43ef30c4f03f38b393420bc753a | 176431.87 |
| 6 | da8622b14eb17ae2831f4ac5b9dab84a | 160236.57 |
| 7 | 7a67c85e85bb2ce8582c35f2203ad736 | 141745.53 |
| 8 | 1025f0e2d44d7041d6cf58b6550e0bfa | 138968.55 |
| 9 | 955fee9216a65b617aa5c0531780ce60 | 135171.70 |
plt.figure(figsize=(12, 7))
sns.barplot(x="revenue", y="seller", data=df_q10, palette="viridis")
plt.title("Top 10 Sellers by Revenue")
plt.xlabel("Total Revenue")
plt.ylabel("Seller ID")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/1324732387.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
conn.rollback()
query_14 = """
SELECT DISTINCT s.seller_id AS seller, COUNT(oi.order_id) AS total_orders
FROM "e-commerce-project".olist_order_items_dataset AS oi
JOIN "e-commerce-project".olist_sellers_dataset AS S
ON oi.seller_id = s.seller_id
GROUP BY seller
ORDER BY total_orders DESC
LIMIT 10
"""
df_q11 = pd.read_sql(query_14, conn)
df_q11
/tmp/ipykernel_2257/2051337999.py:13: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| seller | total_orders | |
|---|---|---|
| 0 | 6560211a19b47992c3666cc44a7e94c0 | 2033 |
| 1 | 4a3ca9315b744ce9f8e9374361493884 | 1987 |
| 2 | 1f50f920176fa81dab994f9023523100 | 1931 |
| 3 | cc419e0650a3c5ba77189a1882b7556a | 1775 |
| 4 | da8622b14eb17ae2831f4ac5b9dab84a | 1551 |
| 5 | 955fee9216a65b617aa5c0531780ce60 | 1499 |
| 6 | 1025f0e2d44d7041d6cf58b6550e0bfa | 1428 |
| 7 | 7c67e1448b00f6e969d365cea6b010ab | 1364 |
| 8 | ea8482cd71df3c1969d7b9473ff13abc | 1203 |
| 9 | 7a67c85e85bb2ce8582c35f2203ad736 | 1171 |
plt.figure(figsize=(12, 7))
sns.barplot(x="total_orders", y="seller", data=df_q11, palette="magma")
plt.title("Top 10 Sellers by Total Orders")
plt.xlabel("Total Orders")
plt.ylabel("Seller ID")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/2652762271.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
The "Whale" Sellers (High Revenue, High Volume)
- Seller 4a3ca9315b744ce9f8e9374361493884 is a top performer in both metrics: #2 in total orders (1,987) and #3 in revenue ($194,004).
- This seller is a cornerstone of the platform, maintaining high sales velocity while selling items with a healthy profit margin.
The "High-Frequency" Specialist
- Seller 6560211a19b47992c3666cc44a7e94c0 leads the platform in total orders with 2,033 sales. However, they do not appear as the #1 leader in revenue.
- This seller likely focuses on low-cost, high-turnover items (e.g., small accessories or household essentials). They drive massive traffic and logistics activity but with lower individual transaction values.
The "Premium" Specialist
- Seller 4869f7a5dfa277a7dca6462dcf3b52b2 is the #1 Revenue Leader ($229,472) but does not have the highest number of orders.
- This seller likely deals in high-ticket items (electronics, luxury watches, or furniture). They generate the most cash flow for the platform with fewer individual shipments.
Business Recommendations¶
Incentivize the "Whale" Sellers: Ensure sellers like 4a3ca9315b744ce9f8e9374361493884 have dedicated support, as they are the most valuable to the platform's stability.
Efficiency for High Volume: For sellers like 6560211a19b47992c3666cc44a7e94c0, the focus should be on logistics optimization. Since they ship thousands of items, even a small reduction in shipping cost or packaging time significantly increases their margin.
Risk Assessment: The marketplace is relatively healthy because the top revenue and top volume are spread across different sellers. There isn't a single seller that "owns" the entire platform, which reduces dependency risk.
What payment methods are most popular?¶
conn.rollback()
query_15 = """
SELECT *
FROM "e-commerce-project".olist_order_payments_dataset
LIMIT 5
"""
df_q12 = pd.read_sql(query_15, conn)
df_q12
/tmp/ipykernel_2257/2582401646.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | payment_sequential | payment_type | payment_installments | payment_value | |
|---|---|---|---|---|---|
| 0 | b81ef226f3fe1789b1e8b2acac839d17 | 1 | credit_card | 8 | 99.33 |
| 1 | a9810da82917af2d9aefd1278f1dcfa0 | 1 | credit_card | 1 | 24.39 |
| 2 | 25e8ea4e93396b6fa0d3dd708e76c1bd | 1 | credit_card | 1 | 65.71 |
| 3 | ba78997921bbcdc1373bb41e913ab953 | 1 | credit_card | 8 | 107.78 |
| 4 | 42fdf880ba16b47b59251dd489d4441a | 1 | credit_card | 2 | 128.45 |
conn.rollback()
query_16 = """
SELECT payment_type, COUNT(order_id) AS count, SUM(payment_value) AS revenue
FROM "e-commerce-project".olist_order_payments_dataset
GROUP BY payment_type
ORDER BY count DESC
LIMIT 4
"""
df_q13 = pd.read_sql(query_16, conn)
df_q13
/tmp/ipykernel_2257/787904435.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| payment_type | count | revenue | |
|---|---|---|---|
| 0 | credit_card | 76795 | 12542084.19 |
| 1 | boleto | 19784 | 2869361.27 |
| 2 | voucher | 5775 | 379436.87 |
| 3 | debit_card | 1529 | 217989.79 |
plt.pie(df_q13["count"], labels=df_q13["payment_type"], autopct='%1.1f%%')
plt.show()
plt.figure(figsize=(12, 7))
sns.barplot(x="revenue", y="payment_type", data=df_q13, palette="magma")
plt.title("Revenue By Payment Type")
plt.xlabel("Revenue")
plt.ylabel("Payment Type")
plt.tight_layout()
plt.show()
/tmp/ipykernel_2257/1527151366.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
Credit Card Dominance
- Credit cards are the undisputed primary payment method, accounting for the vast majority of transactions (76,795 orders) and revenue ($12,542,084.19).
- Since credit cards allow for installments (as seen in your df_q11 with up to 8 installments), they are likely the main driver for high-ticket purchases. The platform’s revenue is highly dependent on credit card processing.
The "Boleto" Factor
- The Boleto (Brazilian bank slip) is the second most popular method with 19,784 orders.
- Boleto is a cash-based payment. This indicates a significant portion of the customer base either prefers not to use credit or does not have access to a credit limit, making this method essential for financial inclusion in the Brazilian market.
Vouchers and Retention
- There were 5,775 transactions using vouchers
- Vouchers are often linked to refunds, loyalty rewards, or promotional discounts. Their presence suggests an active customer retention or compensation system within the Olist ecosystem.
Debit Card Underutilization
- Debit cards represent the smallest share (1,529 orders)
- In Brazilian e-commerce, debit cards historically had lower adoption due to security friction and the lack of installment options compared to credit cards.
Business Recommendations¶
Installment Promotions: Since Credit Card is the leader, offering "interest-free" installments for specific high-value categories (like Watches or Electronics) could further boost the average ticket.
Boleto Incentives: Consider offering a small discount (e.g., 5%) for Boleto or Debit Card payments. This reduces the transaction fees paid to credit card processors and increases immediate cash flow.
Voucher Strategy: Analyze the source of the vouchers. If they are mostly from "Returns," it might indicate a quality issue in certain categories; if they are "Promotional," they are successfully driving repeat purchases.
How long does delivery usually take?¶
conn.rollback()
query_17 = """
SELECT *
FROM "e-commerce-project".olist_orders_dataset
LIMIT 5
"""
df_q14 = pd.read_sql(query_17, conn)
df_q14
/tmp/ipykernel_2257/3532966613.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_atorder_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
conn.rollback()
query_18 = """
SELECT order_id, order_delivered_customer_date, order_purchase_timestamp, order_delivered_customer_date - order_purchase_timestamp AS delivery_time
FROM "e-commerce-project".olist_orders_dataset
WHERE order_delivered_customer_date IS NOT NULL
ORDER BY delivery_time DESC
"""
df_q15 = pd.read_sql(query_18, conn)
df_q15
/tmp/ipykernel_2257/2167054448.py:10: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | order_delivered_customer_date | order_purchase_timestamp | delivery_time | |
|---|---|---|---|---|
| 0 | ca07593549f1816d26a572e06dc1eab6 | 2017-09-19 14:36:39 | 2017-02-21 23:31:27 | 209 days 15:05:12 |
| 1 | 1b3190b2dfa9d789e1f14c05b647a14a | 2018-09-19 23:24:07 | 2018-02-23 14:57:35 | 208 days 08:26:32 |
| 2 | 440d0d17af552815d15a9e41abe49359 | 2017-09-19 15:12:50 | 2017-03-07 23:59:51 | 195 days 15:12:59 |
| 3 | 2fb597c2f772eca01b1f5c561bf6cc7b | 2017-09-19 14:33:17 | 2017-03-08 18:09:02 | 194 days 20:24:15 |
| 4 | 285ab9426d6982034523a855f55a885e | 2017-09-19 14:00:04 | 2017-03-08 22:47:40 | 194 days 15:12:24 |
| ... | ... | ... | ... | ... |
| 96471 | bb5a519e352b45b714192a02ffe25681 | 2017-06-01 08:34:36 | 2017-05-31 11:11:55 | 0 days 21:22:41 |
| 96472 | 8339b608be0d84fca9d8da68b58332c3 | 2018-06-27 17:31:53 | 2018-06-26 20:48:33 | 0 days 20:43:20 |
| 96473 | f3c6775ba3d2d9fe2826f93b71f12008 | 2017-07-05 08:09:26 | 2017-07-04 11:37:47 | 0 days 20:31:39 |
| 96474 | 434cecee7d1a65fc65358a632b6f725f | 2017-05-30 08:06:56 | 2017-05-29 13:21:46 | 0 days 18:45:10 |
| 96475 | 1d893dd7ca5f77ebf5f59f0d2017eee0 | 2017-06-19 21:07:52 | 2017-06-19 08:19:45 | 0 days 12:48:07 |
96476 rows × 4 columns
df_q15["delivery_time"] = df_q15["delivery_time"].dt.days
df_q15.head()
| order_id | order_delivered_customer_date | order_purchase_timestamp | delivery_time | |
|---|---|---|---|---|
| 0 | ca07593549f1816d26a572e06dc1eab6 | 2017-09-19 14:36:39 | 2017-02-21 23:31:27 | 209 |
| 1 | 1b3190b2dfa9d789e1f14c05b647a14a | 2018-09-19 23:24:07 | 2018-02-23 14:57:35 | 208 |
| 2 | 440d0d17af552815d15a9e41abe49359 | 2017-09-19 15:12:50 | 2017-03-07 23:59:51 | 195 |
| 3 | 2fb597c2f772eca01b1f5c561bf6cc7b | 2017-09-19 14:33:17 | 2017-03-08 18:09:02 | 194 |
| 4 | 285ab9426d6982034523a855f55a885e | 2017-09-19 14:00:04 | 2017-03-08 22:47:40 | 194 |
sns.histplot(df_q15["delivery_time"], bins=30, kde=True)
plt.show()
conn.rollback()
query_19 = """
SELECT c.customer_state AS states,
AVG(o.order_delivered_customer_date - o.order_purchase_timestamp) AS avg_delivery_time
FROM "e-commerce-project".olist_orders_dataset AS o
JOIN "e-commerce-project".olist_customers_dataset AS c
ON o.customer_id = c.customer_id
WHERE o.order_delivered_customer_date IS NOT NULL
GROUP BY states
ORDER BY avg_delivery_time DESC
"""
df_q16 = pd.read_sql(query_19, conn)
df_q16
/tmp/ipykernel_2257/1454990310.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| states | avg_delivery_time | |
|---|---|---|
| 0 | RR | 29 days 09:18:03.975610 |
| 1 | AP | 27 days 04:26:29.850746 |
| 2 | AM | 26 days 10:13:25.613793 |
| 3 | AL | 24 days 13:03:09.103275 |
| 4 | PA | 23 days 18:33:00.021142 |
| 5 | MA | 21 days 13:45:05.167364 |
| 6 | SE | 21 days 12:28:29.707463 |
| 7 | CE | 21 days 06:23:52.394058 |
| 8 | AC | 21 days 00:51:25.600000 |
| 9 | PB | 20 days 10:14:32.721470 |
| 10 | PI | 19 days 10:58:13.289916 |
| 11 | RO | 19 days 08:55:43.954732 |
| 12 | BA | 19 days 08:03:04.247236 |
| 13 | RN | 19 days 06:40:32.696203 |
| 14 | PE | 18 days 10:45:35.106717 |
| 15 | MT | 18 days 01:20:17.319413 |
| 16 | TO | 17 days 15:47:36.609490 |
| 17 | ES | 15 days 18:56:36.134336 |
| 18 | MS | 15 days 14:50:22.763195 |
| 19 | GO | 15 days 14:33:07.649464 |
| 20 | RJ | 15 days 07:26:28.563264 |
| 21 | RS | 15 days 07:12:23.863024 |
| 22 | SC | 14 days 23:01:23.299972 |
| 23 | DF | 12 days 23:13:17.884615 |
| 24 | MG | 12 days 00:14:46.320827 |
| 25 | PR | 11 days 23:47:52.704448 |
| 26 | SP | 8 days 18:16:21.207112 |
df_q16["avg_delivery_time"] = df_q16["avg_delivery_time"].dt.days
plt.figure(figsize=(12, 7))
sns.barplot(x="states", y="avg_delivery_time", data=df_q16, palette="magma")
plt.title("AVG Delivery Time Per States")
plt.xlabel("States")
plt.ylabel("AVG Delivery Time")
plt.show()
/tmp/ipykernel_2257/3687523790.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.
The "São Paulo Effect"
- São Paulo has the fastest average delivery time at only 8 days.
- This efficiency confirms that SP is the logistics hub of the platform. Proximity to sellers and high-density transport routes allow for "prime-like" speeds, which explains why SP also has the highest volume of orders.
The Geographic Penalty
- There is a massive disparity in delivery times. While SP takes 8 days, states like RR (Roraima) and AP (Amapá) take nearly 30 days.
- Customers in the North and Northeast regions experience a service level that is 3x to 4x slower than those in the Southeast. This is a significant pain point for market expansion in those regions.
Delivery Time Distribution
- Histogram shows a "Long Tail." Most orders are delivered within 10–15 days, but there are extreme outliers taking over 200 days
- These 200-day outliers are "Customer Service Nightmares." Even if they are few, they heavily damage the brand's reputation and lead to chargebacks or negative reviews.
Business Recommendations¶
Regional Distribution Centers: To reduce the 29-day wait in RR and AP, Olist should consider regional warehouses (Fulfillment centers) in the North/Northeast.
Manage Expectations: The platform should provide dynamic "Estimated Delivery Dates" based on the state. A customer in RR will be less frustrated if they are told "30 days" upfront than if they expect "standard" shipping.
Outlier Investigation: Audit the orders that took 200+ days. Were they lost in the mail? Were the sellers out of stock? Solving these "edge cases" will significantly improve the overall average.
Are late deliveries affecting customer reviews?¶
conn.rollback()
query_20 = """
SELECT *
FROM "e-commerce-project".olist_orders_dataset
LIMIT 5
"""
df_q17 = pd.read_sql(query_20, conn)
df_q17
/tmp/ipykernel_2257/2481437227.py:10: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_atorder_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
conn.rollback()
query_21 = """
SELECT *
FROM "e-commerce-project".olist_order_reviews_dataset
LIMIT 5
"""
df_q18 = pd.read_sql(query_21, conn)
df_q18
/tmp/ipykernel_2257/1178195664.py:10: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | |
|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | None | None | 00:00:00+00:00 | 21:46:59+00:00 |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | None | None | 00:00:00+00:00 | 03:05:13+00:00 |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | None | None | 00:00:00+00:00 | 14:36:24+00:00 |
| 3 | e64fb393e7b32834bb789ff8bb30750e | 658677c97b385a9be170737859d3511b | 5 | None | Recebi bem antes do prazo estipulado. | 00:00:00+00:00 | 22:02:06+00:00 |
| 4 | f7c4243c7fe1938f181bec41a392bdeb | 8e6bfb81e283fa7e4f11123a3fb894f1 | 5 | None | Parabéns lojas lannister adorei comprar pela I... | 00:00:00+00:00 | 10:26:53+00:00 |
conn.rollback()
query_22 = """
SELECT o.order_delivered_customer_date, o.order_estimated_delivery_date,
CASE
WHEN o.order_delivered_customer_date > o.order_estimated_delivery_date
THEN 'delay'
ELSE 'on_time'
END AS delay,
review_score
FROM "e-commerce-project".olist_orders_dataset AS o
JOIN "e-commerce-project".olist_order_reviews_dataset AS r
ON o.order_id = r.order_id
WHERE o.order_delivered_customer_date IS NOT NULL
"""
df_q19 = pd.read_sql(query_22, conn)
df_q19
/tmp/ipykernel_2257/2355831121.py:18: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_delivered_customer_date | order_estimated_delivery_date | delay | review_score | |
|---|---|---|---|---|
| 0 | 2017-10-10 21:25:13 | 2017-10-18 | on_time | 4 |
| 1 | 2018-08-07 15:27:45 | 2018-08-13 | on_time | 4 |
| 2 | 2018-08-17 18:06:29 | 2018-09-04 | on_time | 5 |
| 3 | 2017-12-02 00:28:42 | 2017-12-15 | on_time | 5 |
| 4 | 2018-02-16 18:17:02 | 2018-02-26 | on_time | 5 |
| ... | ... | ... | ... | ... |
| 96354 | 2017-03-17 15:08:01 | 2017-03-28 | on_time | 5 |
| 96355 | 2018-02-28 17:37:56 | 2018-03-02 | on_time | 4 |
| 96356 | 2017-09-21 11:24:17 | 2017-09-27 | on_time | 5 |
| 96357 | 2018-01-25 23:32:54 | 2018-02-15 | on_time | 2 |
| 96358 | 2018-03-16 13:08:30 | 2018-04-03 | on_time | 5 |
96359 rows × 4 columns
plt.figure(figsize=(8, 6))
sns.boxplot(x='delay', y='review_score', data=df_q19, palette='Set2')
plt.title('Distribution of Review Scores: On-Time vs. Delayed')
plt.xlabel('Delivery Status')
plt.ylabel('Review Score (1-5)')
plt.show()
/tmp/ipykernel_2257/4074529510.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.
avg_scores = df_q19.groupby("delay")["review_score"].mean()
avg_scores
| review_score | |
|---|---|
| delay | |
| delay | 2.566550 |
| on_time | 4.293578 |
plt.figure(figsize=(8, 6))
sns.barplot(x='delay', y='review_score', data=avg_scores.reset_index(), palette='viridis')
plt.title('Average Review Score by Delivery Status')
plt.xlabel('Delivery Status')
plt.ylabel('Average Review Score')
plt.ylim(0, 5)
plt.show()
/tmp/ipykernel_2257/169006175.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.
The "Review Penalty" of Delays
- There is a massive drop in satisfaction when an order is delayed. The Average Review Score drops from 4.29 (on-time) to 2.57 (delayed).
- A delay costs the platform nearly 1.7 points in satisfaction per order. This is the difference between a "highly recommended" service and a "poor" service.
Punctuality as a Minimum Standard
- When orders are on-time, the review distribution (Boxplot) shows a median of 5.0.
- For the Olist customer, receiving the package on or before the estimated date is the primary driver of a 5-star experience. Anything less than on-time delivery immediately jeopardizes the rating
High Variance in Delayed Orders
- The boxplot for "delayed" orders is much larger, stretching from 1 to 4 stars.
- This indicates that some customers are "forgiving" of short delays, but a significant portion immediately gives a 1-star rating. This volatility makes delayed orders highly unpredictable for brand reputation.
Business Recommendations¶
Buffer Time Strategy: Since "on-time" status is so critical, the platform could slightly increase the "Estimated Delivery Date" shown to the customer. It is better to "under-promise and over-deliver" than to miss a tight deadline.
Proactive Compensation: For orders identified as "delayed" in the system, Olist could automatically send a discount voucher before the customer leaves a review. This could "rescue" the review score from a 1 to a 3 or 4.
Prioritize High-Value Delays: Using your previous seller analysis, ensure that "Premium" (high-ticket) items never go into "delay" status, as a bad review on an expensive item carries more weight in the marketplace.
Which product categories receive the best and worst reviews?¶
conn.rollback()
query_23 = """
SELECT *
FROM "e-commerce-project".olist_order_reviews_dataset
LIMIT 5
"""
df_q20 = pd.read_sql(query_23, conn)
df_q20
/tmp/ipykernel_2257/2456288014.py:10: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | |
|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | None | None | 00:00:00+00:00 | 21:46:59+00:00 |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | None | None | 00:00:00+00:00 | 03:05:13+00:00 |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | None | None | 00:00:00+00:00 | 14:36:24+00:00 |
| 3 | e64fb393e7b32834bb789ff8bb30750e | 658677c97b385a9be170737859d3511b | 5 | None | Recebi bem antes do prazo estipulado. | 00:00:00+00:00 | 22:02:06+00:00 |
| 4 | f7c4243c7fe1938f181bec41a392bdeb | 8e6bfb81e283fa7e4f11123a3fb894f1 | 5 | None | Parabéns lojas lannister adorei comprar pela I... | 00:00:00+00:00 | 10:26:53+00:00 |
conn.rollback()
query_24 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""
df_q21 = pd.read_sql(query_24, conn)
df_q21
/tmp/ipykernel_2257/563161855.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 09:45:35+00:00 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 11:05:13+00:00 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 14:48:30+00:00 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 10:10:18+00:00 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 13:57:51+00:00 | 199.90 | 18.14 |
conn.rollback()
query_25 = """
SELECT *
FROM "e-commerce-project".olist_products_dataset
LIMIT 5
"""
df_q22 = pd.read_sql(query_25, conn)
df_q22
/tmp/ipykernel_2257/1145064856.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | perfumaria | 40 | 287 | 1 | 225 | 16 | 10 | 14 |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | artes | 44 | 276 | 1 | 1000 | 30 | 18 | 20 |
| 2 | 96bd76ec8810374ed1b65e291975717f | esporte_lazer | 46 | 250 | 1 | 154 | 18 | 9 | 15 |
| 3 | cef67bcfe19066a932b7673e239eb23d | bebes | 27 | 261 | 1 | 371 | 26 | 4 | 26 |
| 4 | 9dc1a7de274444849c219cff195d0b71 | utilidades_domesticas | 37 | 402 | 4 | 625 | 20 | 17 | 13 |
conn.rollback()
query_26 = """
SELECT *
FROM "e-commerce-project".product_category_name_translation
LIMIT 5
"""
df_q23 = pd.read_sql(query_26, conn)
df_q23
/tmp/ipykernel_2257/184134894.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_category_name | product_category_name_english | |
|---|---|---|
| 0 | beleza_saude | health_beauty |
| 1 | informatica_acessorios | computers_accessories |
| 2 | automotivo | auto |
| 3 | cama_mesa_banho | bed_bath_table |
| 4 | moveis_decoracao | furniture_decor |
conn.rollback()
query_27 = """
SELECT t.product_category_name_english, AVG(r.review_score) AS reviews
FROM "e-commerce-project".olist_order_reviews_dataset AS r
JOIN "e-commerce-project".olist_order_items_dataset AS oi
ON r.order_id = oi.order_id
JOIN "e-commerce-project".olist_products_dataset AS p
ON oi.product_id = p.product_id
JOIN "e-commerce-project".product_category_name_translation AS t
ON p.product_category_name = t.product_category_name
WHERE r.review_score IS NOT NULL
GROUP BY t.product_category_name_english
ORDER BY reviews
LIMIT 5
"""
df_q24 = pd.read_sql(query_27, conn)
df_q24
/tmp/ipykernel_2257/2711359845.py:18: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_category_name_english | reviews | |
|---|---|---|
| 0 | security_and_services | 2.500000 |
| 1 | diapers_and_hygiene | 3.256410 |
| 2 | office_furniture | 3.493183 |
| 3 | home_comfort_2 | 3.629630 |
| 4 | fashion_male_clothing | 3.641221 |
plt.figure(figsize=(12, 7))
sns.barplot(x="reviews", y="product_category_name_english", hue="product_category_name_english", data=df_q24, palette="Reds_d", legend=False)
plt.title("Top 5 Product Categories with Worst Reviews")
plt.xlabel("Average Review Score")
plt.ylabel("Product Category (English)")
plt.xlim(1, 5) # Set x-axis limit from 1 to 5 for review scores
plt.tight_layout()
plt.show()
plt.figure(figsize=(8, 6))
sns.heatmap(df_q24.set_index('product_category_name_english')[['reviews']],
annot=True, cmap='Reds_r', fmt=".2f", linewidths=.5, cbar_kws={'label': 'Average Review Score'})
plt.title('Heatmap of Top 5 Product Categories with Worst Reviews')
plt.ylabel('Product Category (English)')
plt.xlabel('')
plt.tight_layout()
plt.show()
conn.rollback()
query_28 = """
SELECT t.product_category_name_english, AVG(r.review_score) AS reviews
FROM "e-commerce-project".olist_order_reviews_dataset AS r
JOIN "e-commerce-project".olist_order_items_dataset AS oi
ON r.order_id = oi.order_id
JOIN "e-commerce-project".olist_products_dataset AS p
ON oi.product_id = p.product_id
JOIN "e-commerce-project".product_category_name_translation AS t
ON p.product_category_name = t.product_category_name
WHERE r.review_score IS NOT NULL
GROUP BY t.product_category_name_english
ORDER BY reviews DESC
LIMIT 5
"""
df_q25 = pd.read_sql(query_28, conn)
df_q25
/tmp/ipykernel_2257/2301364224.py:18: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_category_name_english | reviews | |
|---|---|---|
| 0 | cds_dvds_musicals | 4.642857 |
| 1 | fashion_childrens_clothes | 4.500000 |
| 2 | books_general_interest | 4.446266 |
| 3 | costruction_tools_tools | 4.444444 |
| 4 | flowers | 4.419355 |
plt.figure(figsize=(12, 7))
sns.barplot(x="reviews", y="product_category_name_english", hue="product_category_name_english", data=df_q25, palette="Greens_d", legend=False)
plt.title("Top 5 Product Categories with Best Reviews")
plt.xlabel("Average Review Score")
plt.ylabel("Product Category (English)")
plt.xlim(1, 5) # Set x-axis limit from 1 to 5 for review scores
plt.tight_layout()
plt.show()
plt.figure(figsize=(8, 6))
sns.heatmap(df_q25.set_index('product_category_name_english')[['reviews']],
annot=True, cmap='Greens', fmt=".2f", linewidths=.5, cbar_kws={'label': 'Average Review Score'})
plt.title('Heatmap of Top 5 Product Categories with Best Reviews')
plt.ylabel('Product Category (English)')
plt.xlabel('')
plt.tight_layout()
plt.show()
The Red Zone: Critical Categories
- security_and_services holds the lowest average score at 2.50.
- Categories like diapers_and_hygiene and office_furniture are also struggling (scores below 3.5). These are often "high-expectation" categories where any delay or quality defect leads to immediate dissatisfaction.
- For hygiene and office products, the platform might be facing issues with bulky shipping (leading to damage) or reliability of service providers.
The Green Zone: Excellence in Niche
- cds_dvds_musicals leads with a stellar 4.64 average.
- High-scoring categories like fashion_childrens_clothes and books_general_interest (all > 4.4) share a common trait: they are generally easy to ship, standardized, and less prone to mechanical failure.
- These categories represent the "safe bets" for the platform. They drive positive sentiment and organic growth through word-of-mouth.
Business Recommendations¶
Quality Audit for Security: The score of 2.5 is a red flag. Olist should investigate if a specific seller is dragging down the average or if the category has inherent flaws (e.g., poor instructions or technical failure).
Logistics for Furniture: Since office_furniture is in the bottom 5, it correlates with our previous finding that bulky items suffer more delays. Improving the specialized "heavy-load" delivery network could raise these scores.
Cross-Sell from the Best: Use the high-satisfaction categories (like books or flowers) as "entry points" for new customers. A great first experience with a 4.6-star category builds trust before the customer tries a more complex one.
What is the average order value (AOV)?¶
conn.rollback()
query_29 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""
df_q26 = pd.read_sql(query_29, conn)
df_q26
/tmp/ipykernel_2257/2055158174.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 09:45:35+00:00 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 11:05:13+00:00 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 14:48:30+00:00 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 10:10:18+00:00 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 13:57:51+00:00 | 199.90 | 18.14 |
conn.rollback()
query_30 = """
SELECT *
FROM "e-commerce-project".olist_orders_dataset
LIMIT 5
"""
df_q27 = pd.read_sql(query_30, conn)
df_q27
/tmp/ipykernel_2257/1789558180.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_atorder_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
conn.rollback()
query_31 = """
SELECT SUM(oi.price) AS total_revenue, COUNT(o.order_id) AS total_orders, SUM(oi.price)/COUNT(o.order_id) AS aov
FROM "e-commerce-project".olist_orders_dataset AS o
JOIN "e-commerce-project".olist_order_items_dataset AS oi
ON o.order_id = oi.order_id
"""
df_q28 = pd.read_sql(query_31, conn)
df_q28
/tmp/ipykernel_2257/112653745.py:10: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| total_revenue | total_orders | aov | |
|---|---|---|---|
| 0 | 13591643.7 | 112650 | 120.653739 |
conn.rollback()
query_32 = """
SELECT TO_CHAR(o.order_purchase_timestamp, 'YYYY-MM') AS monthly, SUM(oi.price) AS total_revenue, COUNT(o.order_id) AS total_orders, SUM(oi.price)/COUNT(o.order_id) AS aov
FROM "e-commerce-project".olist_orders_dataset AS o
JOIN "e-commerce-project".olist_order_items_dataset AS oi
ON o.order_id = oi.order_id
GROUP BY monthly
ORDER BY monthly
"""
df_q29 = pd.read_sql(query_32, conn)
df_q29
/tmp/ipykernel_2257/371705559.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| monthly | total_revenue | total_orders | aov | |
|---|---|---|---|---|
| 0 | 2016-09 | 267.36 | 6 | 44.560000 |
| 1 | 2016-10 | 49507.66 | 363 | 136.384738 |
| 2 | 2016-12 | 10.90 | 1 | 10.900000 |
| 3 | 2017-01 | 120312.87 | 955 | 125.982063 |
| 4 | 2017-02 | 247303.02 | 1951 | 126.757058 |
| 5 | 2017-03 | 374344.30 | 3000 | 124.781433 |
| 6 | 2017-04 | 359927.23 | 2684 | 134.101054 |
| 7 | 2017-05 | 506071.14 | 4136 | 122.357626 |
| 8 | 2017-06 | 433038.60 | 3583 | 120.859224 |
| 9 | 2017-07 | 498031.48 | 4519 | 110.208338 |
| 10 | 2017-08 | 573971.68 | 4910 | 116.898509 |
| 11 | 2017-09 | 624401.69 | 4831 | 129.248953 |
| 12 | 2017-10 | 664219.43 | 5322 | 124.806357 |
| 13 | 2017-11 | 1010271.37 | 8665 | 116.592195 |
| 14 | 2017-12 | 743914.17 | 6308 | 117.931860 |
| 15 | 2018-01 | 950030.36 | 8208 | 115.744440 |
| 16 | 2018-02 | 844178.71 | 7672 | 110.033721 |
| 17 | 2018-03 | 983213.44 | 8217 | 119.656011 |
| 18 | 2018-04 | 996647.75 | 7975 | 124.971505 |
| 19 | 2018-05 | 996517.68 | 7925 | 125.743556 |
| 20 | 2018-06 | 865124.31 | 7078 | 122.227227 |
| 21 | 2018-07 | 895507.22 | 7092 | 126.270054 |
| 22 | 2018-08 | 854686.33 | 7248 | 117.920299 |
| 23 | 2018-09 | 145.00 | 1 | 145.000000 |
plt.figure(figsize=(14, 7))
sns.lineplot(x='monthly', y='aov', data=df_q29)
plt.title('Monthly Average Order Value (AOV) Trend')
plt.xlabel('Month')
plt.ylabel('Average Order Value')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
Consistent Exponential Growth
- Total monthly revenue grew from a mere 267 in late 2016 to nearly 1 Million per month by late 2017 and 2018.
- The platform experienced a massive scaling phase during 2017. The volume of orders increased proportionally, showing that Olist successfully acquired both new sellers and new customers at a high rate.
Stability of the Average Order Value (AOV)
- Despite the massive increase in total orders, the AOV has remained remarkably stable, fluctuating mostly between $110 and $130.
- This stability is a good sign; it means that as the platform grows, it isn't just attracting "cheap" low-value sales. It is maintaining its middle-market position. The peaks in the line chart likely represent periods where high-ticket items (like Electronics) had successful promotions.
Business Recommendations¶
AOV Growth Campaigns: Since AOV is stable but flat, Olist could implement "Upselling" strategies (e.g., "Buy 3, get 10% off") or Product Bundling to push the AOV from $120 closer to $150.
Infrastructure Readiness: Based on the November spike, the platform must ensure logistics and server capacity are doubled every October to prepare for the annual sales peak.
Low-Month Analysis: Investigate the slight dips in February or June to see if they are related to Brazilian holidays (like Carnival) and create specific "Off-Season" promotions to smooth out the revenue curve.
Which cities have the most customers?¶
conn.rollback()
query_33 = """
SELECT *
FROM "e-commerce-project".olist_customers_dataset
LIMIT 5
"""
df_q30 = pd.read_sql(query_33, conn)
df_q30
/tmp/ipykernel_2257/2706480270.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| customer_id | customer_unique_id | customer_city | customer_state | customer_zip_code_prefix | |
|---|---|---|---|---|---|
| 0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | franca | SP | 14409 |
| 1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | sao bernardo do campo | SP | 09790 |
| 2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | sao paulo | SP | 01151 |
| 3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | mogi das cruzes | SP | 08775 |
| 4 | 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | campinas | SP | 13056 |
conn.rollback()
query_33 = """
SELECT customer_state AS state, customer_city AS city, COUNT(customer_id) AS total_customer
FROM "e-commerce-project".olist_customers_dataset
GROUP BY state, city
ORDER BY total_customer DESC
"""
df_q31 = pd.read_sql(query_33, conn)
df_q31
/tmp/ipykernel_2257/1480094160.py:10: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| state | city | total_customer | |
|---|---|---|---|
| 0 | SP | sao paulo | 15540 |
| 1 | RJ | rio de janeiro | 6882 |
| 2 | MG | belo horizonte | 2773 |
| 3 | DF | brasilia | 2131 |
| 4 | PR | curitiba | 1521 |
| ... | ... | ... | ... |
| 4305 | ES | santa leopoldina | 1 |
| 4306 | SP | flora rica | 1 |
| 4307 | GO | teresina de goias | 1 |
| 4308 | TO | novo alegre | 1 |
| 4309 | MG | frei lagonegro | 1 |
4310 rows × 3 columns
df_states = (
df_q31
.groupby("state", as_index=False)["total_customer"]
.sum()
)
url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson"
geojson = requests.get(url).json()
fig = px.choropleth(
df_states,
geojson=geojson,
locations="state",
featureidkey="properties.sigla", # matches SP, RJ, MG etc
color="total_customer",
hover_name="state",
hover_data=["total_customer"],
title="Clients by Brazilian State",
color_continuous_scale="Reds"
)
fig.update_geos(
fitbounds="locations",
visible=False
)
fig.show()
top_10_citys = df_q31.head(10)
plt.figure(figsize=(12, 7))
sns.barplot(x="total_customer", y="city", hue="city", data=top_10_citys, palette="Reds", legend=False)
plt.title("Top 10 Citys With Most Customers")
plt.xlabel("Total Customers")
plt.ylabel("citys")
plt.show()
The "Heart" of the Market: São Paulo
- São Paulo is the absolute leader with 15,540 customers, nearly triple that of the second-place city (Rio de Janeiro).
- SP is not just a market; it is the platform's core ecosystem. The high density allows for lower shipping costs and the 8-day delivery average we observed earlier.
Urban Concentration
- The top 10 cities are primarily state capitals in the Southeast and South.
- While the platform is "national," the revenue is highly regionalized. States like Amapá (AP) and Roraima (RR) show very low customer density on the choropleth map, which correlates with the 30-day delivery times previously identified.
What factors influence product price and freight cost?¶
conn.rollback()
query_34 = """
SELECT *
FROM "e-commerce-project".olist_products_dataset
LIMIT 5
"""
df_q32 = pd.read_sql(query_34, conn)
df_q32
/tmp/ipykernel_2257/348530697.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | perfumaria | 40 | 287 | 1 | 225 | 16 | 10 | 14 |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | artes | 44 | 276 | 1 | 1000 | 30 | 18 | 20 |
| 2 | 96bd76ec8810374ed1b65e291975717f | esporte_lazer | 46 | 250 | 1 | 154 | 18 | 9 | 15 |
| 3 | cef67bcfe19066a932b7673e239eb23d | bebes | 27 | 261 | 1 | 371 | 26 | 4 | 26 |
| 4 | 9dc1a7de274444849c219cff195d0b71 | utilidades_domesticas | 37 | 402 | 4 | 625 | 20 | 17 | 13 |
conn.rollback()
query_35 = """
SELECT *
FROM "e-commerce-project".olist_order_items_dataset
LIMIT 5
"""
df_q33 = pd.read_sql(query_35, conn)
df_q33
/tmp/ipykernel_2257/712068128.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 09:45:35+00:00 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 11:05:13+00:00 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 14:48:30+00:00 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 10:10:18+00:00 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 13:57:51+00:00 | 199.90 | 18.14 |
conn.rollback()
query_36 = """
SELECT p.product_weight_g, oi.freight_value
FROM "e-commerce-project".olist_products_dataset AS p
JOIN "e-commerce-project".olist_order_items_dataset AS oi
ON p.product_id = oi.product_id
WHERE P.product_weight_g IS NOT NULL OR oi.freight_value IS NOT NULL
"""
df_q34 = pd.read_sql(query_36, conn)
df_q34
/tmp/ipykernel_2257/3334282439.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_weight_g | freight_value | |
|---|---|---|
| 0 | 650.0 | 13.29 |
| 1 | 30000.0 | 19.93 |
| 2 | 3050.0 | 17.87 |
| 3 | 200.0 | 12.79 |
| 4 | 3750.0 | 18.14 |
| ... | ... | ... |
| 112645 | 10150.0 | 43.41 |
| 112646 | 8950.0 | 36.53 |
| 112647 | 967.0 | 16.95 |
| 112648 | 100.0 | 8.72 |
| 112649 | 600.0 | 12.79 |
112650 rows × 2 columns
plt.figure(figsize=(10, 6))
sns.scatterplot(x='product_weight_g', y='freight_value', data=df_q34, alpha=0.6)
plt.title('Product Weight vs. Freight Value')
plt.xlabel('Product Weight (g)')
plt.ylabel('Freight Value')
plt.ylim(0, 150)
plt.tight_layout()
plt.show()
correlation = df_q34['product_weight_g'].corr(df_q34['freight_value'])
print(f"Correlation between Product Weight and Freight Value: {correlation:.2f}")
Correlation between Product Weight and Freight Value: 0.61
conn.rollback()
query_37 = """
SELECT p.product_length_cm * p.product_height_cm * p.product_width_cm AS size, oi.freight_value
FROM "e-commerce-project".olist_products_dataset AS p
JOIN "e-commerce-project".olist_order_items_dataset AS oi
ON p.product_id = oi.product_id
WHERE p.product_length_cm IS NOT NULL OR p.product_height_cm IS NOT NULL OR p.product_width_cm IS NOT NULL OR oi.freight_value IS NOT NULL
"""
df_q35 = pd.read_sql(query_37, conn)
df_q35
/tmp/ipykernel_2257/3112260560.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| size | freight_value | |
|---|---|---|
| 0 | 3528.0 | 13.29 |
| 1 | 60000.0 | 19.93 |
| 2 | 14157.0 | 17.87 |
| 3 | 2400.0 | 12.79 |
| 4 | 42000.0 | 18.14 |
| ... | ... | ... |
| 112645 | 53400.0 | 43.41 |
| 112646 | 44460.0 | 36.53 |
| 112647 | 9576.0 | 16.95 |
| 112648 | 8000.0 | 8.72 |
| 112649 | 1710.0 | 12.79 |
112650 rows × 2 columns
plt.figure(figsize=(10, 6))
sns.scatterplot(x='size', y='freight_value', data=df_q35, alpha=0.6)
plt.title('Product Size vs. Freight Value')
plt.xlabel('Product Size')
plt.ylabel('Freight Value')
plt.ylim(0, 150)
plt.tight_layout()
plt.show()
correlation = df_q35['size'].corr(df_q35['freight_value'])
print(f"Correlation between Product Size and Freight Value: {correlation:.2f}")
Correlation between Product Size and Freight Value: 0.59
conn.rollback()
query_38 = """
SELECT price, freight_value
FROM "e-commerce-project".olist_order_items_dataset
WHERE price IS NOT NULL OR freight_value IS NOT NULL
"""
df_q36 = pd.read_sql(query_38, conn)
df_q36
/tmp/ipykernel_2257/30038573.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| price | freight_value | |
|---|---|---|
| 0 | 58.90 | 13.29 |
| 1 | 239.90 | 19.93 |
| 2 | 199.00 | 17.87 |
| 3 | 12.99 | 12.79 |
| 4 | 199.90 | 18.14 |
| ... | ... | ... |
| 112645 | 299.99 | 43.41 |
| 112646 | 350.00 | 36.53 |
| 112647 | 99.90 | 16.95 |
| 112648 | 55.99 | 8.72 |
| 112649 | 43.00 | 12.79 |
112650 rows × 2 columns
plt.figure(figsize=(10, 6))
sns.scatterplot(x='price', y='freight_value', data=df_q36, alpha=0.6)
plt.title('Product Price vs. Freight Value')
plt.xlabel('Product Price')
plt.ylabel('Freight Value')
plt.ylim(0, 150)
plt.tight_layout()
plt.show()
correlation = df_q36['price'].corr(df_q36['freight_value'])
print(f"Correlation between Product Price and Freight Value: {correlation:.2f}")
Correlation between Product Price and Freight Value: 0.41
conn.rollback()
query_39 = """
SELECT
p.product_weight_g, p.product_length_cm * p.product_height_cm * p.product_width_cm AS size, oi.freight_value
FROM "e-commerce-project".olist_products_dataset AS p
JOIN "e-commerce-project".olist_order_items_dataset AS oi
ON p.product_id = oi.product_id
WHERE p.product_weight_g IS NOT NULL
OR p.product_length_cm IS NOT NULL
OR p.product_height_cm IS NOT NULL
OR p.product_width_cm IS NOT NULL
OR oi.freight_value IS NOT NULL
"""
df_q37 = pd.read_sql(query_39, conn)
df_q37
/tmp/ipykernel_2257/2218929922.py:16: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
| product_weight_g | size | freight_value | |
|---|---|---|---|
| 0 | 650.0 | 3528.0 | 13.29 |
| 1 | 30000.0 | 60000.0 | 19.93 |
| 2 | 3050.0 | 14157.0 | 17.87 |
| 3 | 200.0 | 2400.0 | 12.79 |
| 4 | 3750.0 | 42000.0 | 18.14 |
| ... | ... | ... | ... |
| 112645 | 10150.0 | 53400.0 | 43.41 |
| 112646 | 8950.0 | 44460.0 | 36.53 |
| 112647 | 967.0 | 9576.0 | 16.95 |
| 112648 | 100.0 | 8000.0 | 8.72 |
| 112649 | 600.0 | 1710.0 | 12.79 |
112650 rows × 3 columns
correlation = df_q37[['product_weight_g', 'size', 'freight_value']].corr()
print(correlation)
product_weight_g size freight_value product_weight_g 1.000000 0.802267 0.61042 size 0.802267 1.000000 0.58727 freight_value 0.610420 0.587270 1.00000
Relationship Between Product Weight and Freight Value¶
- Visual Trend: The scatter plot reveals a clear upward trend, where higher product weights generally correspond to higher freight values.
- Correlation Coefficient: The calculated correlation of 0.61 indicates a strong positive relationship.
- Observation: This strong correlation suggests that as the weight of a product increases, shipping costs rise significantly. This is likely driven by carrier pricing models that utilize weight-based brackets or volumetric weight calculations, making product weight a primary driver of logistics expenses.
Product Size (Volume) vs. Freight Value¶
1. Correlation Analysis
- The correlation coefficient between Product Size (Volume) and Freight Value is 0.59.
- This indicates a moderate positive relationship, confirming that as the physical dimensions of a product increase, the cost of shipping tends to rise.
2. Comparison with Product Weight
- When compared to the correlation between Product Weight and Freight Value (0.61), volume is a slightly weaker driver of shipping costs in this dataset.
- This suggests that while carrier space (volumetric weight) is a significant factor in pricing, the actual weight of the package remains the primary metric used by logistics partners for cost calculation.
3. Visual Observations
- The scatter plot shows a high concentration of data points for smaller volumes, with a 'long tail' for larger items.
- The variance increases for larger sizes, suggesting that for very bulky items, other factors (such as distance or specialized handling) might play a more dominant role than just the dimensions themselves.
Price vs. Freight Value Analysis¶
Correlation Assessment: The correlation coefficient between product price and freight value is 0.41. This indicates a moderate positive correlation.
Price as a Driver: While there is a visible upward trend, the 0.41 value suggests that price is a secondary driver of freight costs compared to weight (0.61) and size (0.59). Shipping costs are more heavily dictated by physical logistics than by the value of the item.
The 'Insurance Effect':
- Higher-priced items do show a slight increase in shipping fees, which likely accounts for insurance premiums or special handling fees for high-ticket goods.
- However, the significant spread in the scatter plot shows many expensive items still have low freight costs (if they are small/light), while many cheap items have high freight costs (if they are bulky).
Detailed Business Strategies and Actionable Recommendations¶
1. Logistics Optimization Strategy: Heavy & Bulky Items¶
Weight (0.61) and Size (0.59) are the primary drivers of freight cost, significantly more so than Price (0.41).
- Action: Negotiate specialized freight contracts for 'Heavy Load' categories (Furniture, Garden Tools). Moving away from general postal rates to dedicated LTL (Less-Than-Truckload) carriers for items over 10kg or specific volumetric dimensions can reduce costs by 15-20%.
- Target: Office Furniture and Housewares, which currently suffer from high freight-to-price ratios.
2. 'Flat-Rate' Competitiveness for High-Value Items¶
High-ticket items like Watches and Health & Beauty have a lower correlation with freight costs relative to their value.
- Action: Implement a Flat-Rate Shipping model for items weighing under 1kg with a value exceeding $200.
- Impact: This increases the competitiveness of premium sellers and encourages customers to add more high-value, low-weight items to their carts without the friction of calculated shipping.
3. Mitigating the 'Geographic Penalty' via Regional Hubs¶
Northern states (RR, AP, AM) experience 3x-4x longer delivery times (up to 30 days) and higher costs compared to São Paulo (8 days).
- Action: Establish Regional Fulfillment Centers in the Northeast (e.g., Fortaleza or Recife).
- Impact: By pre-positioning top-selling products (identified in the 'Bed, Bath & Table' volume analysis) in these regions, Olist can reduce delivery times to under 10 days and slash long-haul freight costs, unlocking growth in currently underserved markets.
4. Packaging Efficiency & Volumetric Reduction¶
There is a high collinearity (0.80) between product size and weight.
- Action: Launch a 'Sustainable Packaging Initiative' for sellers. Provide guidelines and subsidized eco-friendly, right-sized packaging to reduce 'dead air' in shipments.
- Impact: Reducing the volumetric weight of bulky items directly impacts the 0.59 correlation factor of size-to-freight, immediately improving margins for both the platform and the sellers.
5. Buffer-Time & Review Score Protection¶
Late deliveries cause a massive drop in satisfaction (4.29 to 2.57 avg score).
- Action: Implement a 'Dynamic Buffer' in the checkout process. For regions with high delivery variance (North/Northeast), add 3-5 days to the Estimated Delivery Date while simultaneously triggering proactive discount vouchers if a delay is detected in the carrier's API.
- Impact: Under-promising and over-delivering will protect the platform's brand reputation and reduce 1-star reviews.