!pip install psycopg2-binary
import psycopg2
from google.colab import userdata
password_post = userdata.get('postpass')
conn = psycopg2.connect(
host="6.tcp.ngrok.io",
port=12973,
database="test_db",
user="postgres",
password=password_post
)
cursor = conn.cursor()
cursor.execute("SELECT version();")
print(cursor.fetchone())
Collecting psycopg2-binary Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (4.9 kB) Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (4.2 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.2/4.2 MB 56.9 MB/s eta 0:00:00 Installing collected packages: psycopg2-binary Successfully installed psycopg2-binary-2.9.11 ('PostgreSQL 18.1 on x86_64-windows, compiled by msvc-19.44.35221, 64-bit',)
conn.rollback()
cursor.execute("""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'Lending_Club_Loan_Project';
""")
print(cursor.fetchall())
[('rejected_2007_to_2018',), ('accepted_2007_to_2018',)]
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
conn.rollback()
query_1 = """
SELECT *
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
LIMIT 5
"""
df_1 = pd.read_sql(query_1, conn)
df_1
/tmp/ipykernel_3790/3215516387.py:8: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_1 = pd.read_sql(query_1, conn)
| id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | ... | hardship_payoff_balance_amount | hardship_last_payment_amount | disbursement_method | debt_settlement_flag | debt_settlement_flag_date | settlement_status | settlement_date | settlement_amount | settlement_percentage | settlement_term | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 143506724.0 | None | 10000.0 | 10000.0 | 10000.0 | 36 months | 19.92 | 371.23 | D | D3 | ... | None | None | Cash | N | None | None | None | None | None | None |
| 1 | 143181109.0 | None | 28500.0 | 28500.0 | 28500.0 | 60 months | 11.80 | 631.10 | B | B4 | ... | None | None | Cash | N | None | None | None | None | None | None |
| 2 | 143512446.0 | None | 8000.0 | 8000.0 | 8000.0 | 36 months | 11.31 | 263.09 | B | B3 | ... | None | None | Cash | N | None | None | None | None | None | None |
| 3 | 143288348.0 | None | 7500.0 | 7500.0 | 7500.0 | 36 months | 20.89 | 282.14 | D | D4 | ... | None | None | DirectPay | N | None | None | None | None | None | None |
| 4 | 143223161.0 | None | 30750.0 | 30750.0 | 30750.0 | 36 months | 23.40 | 1196.75 | E | E1 | ... | None | None | Cash | N | None | None | None | None | None | None |
5 rows × 151 columns
Credit Risk & Default Analysis¶
1. What is the Default Rate by Credit Grade?¶
conn.rollback()
query_2 = """
SELECT
grade,
sub_grade,
COUNT(id) AS total_loans,
ROUND(
(SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 ELSE 0 END) * 100.0) / COUNT(id),
2
) AS default_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY grade, sub_grade
ORDER BY grade;
"""
df_2 = pd.read_sql(query_2, conn)
df_2
/tmp/ipykernel_3790/1170315299.py:17: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_2 = pd.read_sql(query_2, conn)
| grade | sub_grade | total_loans | default_rate | |
|---|---|---|---|---|
| 0 | A | A1 | 86790 | 1.62 |
| 1 | A | A2 | 69559 | 2.49 |
| 2 | A | A3 | 73184 | 2.86 |
| 3 | A | A4 | 95874 | 3.74 |
| 4 | A | A5 | 107612 | 5.00 |
| 5 | B | B1 | 125326 | 5.92 |
| 6 | B | B2 | 126610 | 6.64 |
| 7 | B | B3 | 131502 | 8.08 |
| 8 | B | B4 | 139770 | 8.83 |
| 9 | B | B5 | 140264 | 9.83 |
| 10 | C | C1 | 145881 | 11.13 |
| 11 | C | C2 | 131078 | 12.52 |
| 12 | C | C3 | 129159 | 13.08 |
| 13 | C | C4 | 127087 | 14.59 |
| 14 | C | C5 | 116700 | 15.05 |
| 15 | D | D1 | 81752 | 17.46 |
| 16 | D | D2 | 72856 | 18.21 |
| 17 | D | D3 | 64782 | 18.54 |
| 18 | D | D4 | 56853 | 20.25 |
| 19 | D | D5 | 47984 | 20.83 |
| 20 | E | E1 | 33537 | 25.54 |
| 21 | E | E2 | 29887 | 26.85 |
| 22 | E | E3 | 26673 | 26.69 |
| 23 | E | E4 | 22729 | 27.74 |
| 24 | E | E5 | 22655 | 26.58 |
| 25 | F | F1 | 13392 | 31.49 |
| 26 | F | F2 | 9288 | 35.14 |
| 27 | F | F3 | 7772 | 35.31 |
| 28 | F | F4 | 6104 | 38.12 |
| 29 | F | F5 | 5151 | 37.64 |
| 30 | G | G1 | 4093 | 35.04 |
| 31 | G | G2 | 2678 | 39.10 |
| 32 | G | G3 | 2084 | 39.78 |
| 33 | G | G4 | 1692 | 38.71 |
| 34 | G | G5 | 1549 | 38.41 |
plt.figure(figsize=(10, 6))
sns.barplot(x='grade', y='default_rate', data=df_2, palette='viridis')
plt.title('Average Default Rate by Credit Grade')
plt.xlabel('Credit Grade')
plt.ylabel('Default Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
/tmp/ipykernel_3790/2743319193.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='grade', y='default_rate', data=df_2, palette='viridis')
The grading system is highly accurate. Default rates climb steadily from 1.62% (A1) to nearly 40% (G3). A critical "risk cliff" exists between Grade D (19%) and Grade E (26%), indicating a significant shift in borrower stability at this threshold.
Recommendation: Implement an "Early Warning System" for Grades D and E. Since 1 in 4 of these borrowers will default, the collections department should initiate proactive, soft-touch outreach as soon as a payment is 1 day past due.
2. How does the Debt-to-Income (DTI) ratio influence default?¶
conn.rollback()
query_3 = """
SELECT
CASE
WHEN dti <= 20.00 THEN 'low_dti'
WHEN dti > 20.00 AND dti <= 40.00 THEN 'mid_dti'
ELSE 'high_dti'
END AS dti_range,
ROUND(
(SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 ELSE 0 END) * 100.0) / COUNT(id),
2
) AS default_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY dti_range
ORDER BY default_rate;
"""
df_3 = pd.read_sql(query_3, conn)
df_3
/tmp/ipykernel_3790/1285300349.py:19: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_3 = pd.read_sql(query_3, conn)
| dti_range | default_rate | |
|---|---|---|
| 0 | high_dti | 7.40 |
| 1 | low_dti | 10.29 |
| 2 | mid_dti | 14.38 |
plt.figure(figsize=(10, 6))
sns.barplot(x='dti_range', y='default_rate', data=df_3, palette='viridis')
plt.title('Average Default Rate by Credit Grade')
plt.xlabel('dti_range')
plt.ylabel('Default Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
/tmp/ipykernel_3790/4033595918.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='dti_range', y='default_rate', data=df_3, palette='viridis')
The relationship is non-linear. The Mid-DTI group (20-40) is the riskiest (14.38% default), while the High-DTI group (>40) is ironically the safest (7.40%). This suggests a "selection bias" where high-DTI loans are only approved for borrowers with otherwise flawless credit profiles.
Recommendation: Tighten credit score requirements specifically for the 20-40 DTI range, as this appears to be the "financial tipping point" where borrowers are most vulnerable to economic shocks.
3. Does home ownership status affect the probability of repayment?¶
conn.rollback()
query_4 = """
SELECT home_ownership,
ROUND(
(SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 ELSE 0 END) * 100.0) / COUNT(id),
2
) AS default_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY home_ownership
ORDER BY default_rate;
"""
df_4 = pd.read_sql(query_4, conn)
df_4
/tmp/ipykernel_3790/945423556.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_4 = pd.read_sql(query_4, conn)
| home_ownership | default_rate | |
|---|---|---|
| 0 | ANY | 5.62 |
| 1 | MORTGAGE | 10.31 |
| 2 | OWN | 11.80 |
| 3 | NONE | 13.21 |
| 4 | RENT | 13.87 |
| 5 | OTHER | 15.79 |
plt.figure(figsize=(10, 6))
sns.barplot(x='home_ownership', y='default_rate', data=df_4, palette='viridis')
plt.title('Average Default Rate by home_ownership')
plt.xlabel('home_ownership')
plt.ylabel('Default Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
/tmp/ipykernel_3790/2456128957.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='home_ownership', y='default_rate', data=df_4, palette='viridis')
Stability is key. Borrowers with a MORTGAGE (10.31% default) are significantly safer than those who RENT (13.87%). This 3.5% gap represents a major opportunity for risk-adjusted pricing.
Recommendation: Use homeownership as a "limit booster." Mortgage holders should be eligible for higher credit limits, while Renters should start with lower "test" limits until they prove 12 months of consistent payment history.
Profitability & Interest Rates¶
1. What is the average interest rate charged by loan purpose?¶
conn.rollback()
query_5 = """
SELECT purpose, AVG(int_rate) AS int_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY purpose
ORDER BY int_rate
"""
df_5 = pd.read_sql(query_5, conn)
df_5
/tmp/ipykernel_3790/2700889306.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_5 = pd.read_sql(query_5, conn)
| purpose | int_rate | |
|---|---|---|
| 0 | credit_card | 11.696088 |
| 1 | educational | 11.958036 |
| 2 | car | 12.181766 |
| 3 | home_improvement | 12.620322 |
| 4 | major_purchase | 12.762865 |
| 5 | vacation | 13.451090 |
| 6 | debt_consolidation | 13.522794 |
| 7 | medical | 13.631922 |
| 8 | wedding | 14.152037 |
| 9 | other | 14.235124 |
| 10 | house | 14.395558 |
| 11 | renewable_energy | 14.727548 |
| 12 | moving | 14.738670 |
| 13 | small_business | 15.263895 |
plt.figure(figsize=(12, 7))
sns.barplot(x='int_rate', y='purpose', data=df_5.sort_values(by='int_rate', ascending=False), palette='viridis')
plt.title('Average Interest Rate by Loan Purpose')
plt.xlabel('Average Interest Rate (%)')
plt.ylabel('Loan Purpose')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
/tmp/ipykernel_3790/4226815170.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='int_rate', y='purpose', data=df_5.sort_values(by='int_rate', ascending=False), palette='viridis')
Credit card refinancing carries the lowest rates (11.70%), while Small Business loans carry the highest (15.26%). This reflects the high mortality rate of new ventures compared to stabilizing existing debt.
Recommendation: Aggressively market "Credit Card Refinance" to "Debt Consolidation" borrowers. By shifting them into the lower-rate category with stricter documentation, you can capture higher-quality volume from competitors.
2. What is the total loss from "Charged Off" loans vs. total amount funded?¶
conn.rollback()
query_6 = """
SELECT
COUNT(id) AS total_loans,
SUM(funded_amnt) AS total_funded_amnt,
COUNT(CASE WHEN loan_status = 'Charged Off' THEN id END) AS num_of_loans_loses,
SUM(CASE WHEN loan_status = 'Charged Off' THEN loan_amnt ELSE 0 END) AS total_loses
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018;
"""
df_6 = pd.read_sql(query_6, conn)
df_6
/tmp/ipykernel_3790/4183277396.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_6 = pd.read_sql(query_6, conn)
| total_loans | total_funded_amnt | num_of_loans_loses | total_loses | |
|---|---|---|---|---|
| 0 | 2259907 | 3.399717e+10 | 268559 | 4.180136e+09 |
conn.rollback()
query_7 = """
SELECT
issue_d,
COUNT(id) AS total_loans,
SUM(funded_amnt) AS total_funded_amnt,
COUNT(CASE WHEN loan_status = 'Charged Off' THEN id END) AS num_of_loans_loses,
SUM(CASE WHEN loan_status = 'Charged Off' THEN loan_amnt ELSE 0 END) AS total_loses
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY issue_d
ORDER BY issue_d;
"""
df_7 = pd.read_sql(query_7, conn)
df_7["issue_d"] = pd.to_datetime(df_7["issue_d"], format="%b-%Y")
df_7["issue_d"] = df_7["issue_d"].dt.strftime('%b-%Y')
df_7
/tmp/ipykernel_3790/3613905381.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_7 = pd.read_sql(query_7, conn)
| issue_d | total_loans | total_funded_amnt | num_of_loans_loses | total_loses | |
|---|---|---|---|---|---|
| 0 | Apr-2008 | 223 | 1580975.0 | 27 | 331700.0 |
| 1 | Apr-2009 | 324 | 2981875.0 | 40 | 417200.0 |
| 2 | Apr-2010 | 893 | 9311075.0 | 82 | 830550.0 |
| 3 | Apr-2011 | 1563 | 17520000.0 | 222 | 2877750.0 |
| 4 | Apr-2012 | 3230 | 42051125.0 | 496 | 6970325.0 |
| ... | ... | ... | ... | ... | ... |
| 134 | Sep-2014 | 10606 | 159444175.0 | 1879 | 29054400.0 |
| 135 | Sep-2015 | 28641 | 450246800.0 | 5180 | 83119550.0 |
| 136 | Sep-2016 | 28144 | 399512400.0 | 4203 | 64750825.0 |
| 137 | Sep-2017 | 39713 | 601398225.0 | 3347 | 54370925.0 |
| 138 | Sep-2018 | 39026 | 635394775.0 | 173 | 2726500.0 |
139 rows × 5 columns
plt.figure(figsize=(15, 6))
sns.lineplot(x='issue_d', y='total_loses', data=df_7)
plt.title('Total Losses by Issue Date (Month-Year)')
plt.xlabel('Issue Date')
plt.ylabel('Total Losses ($)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
df_7['issue_d'] = pd.to_datetime(df_7['issue_d'], format='%b-%Y')
df_7['loss_rate'] = (df_7['total_loses'] / df_7['total_funded_amnt']) * 100
plt.figure(figsize=(15, 6))
sns.lineplot(x='issue_d', y='loss_rate', data=df_7)
plt.title('Loss Rate by Issue Date (Month-Year)')
plt.xlabel('Issue Date')
plt.ylabel('Loss Rate (%)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
The portfolio has "Charged Off" 4.18 Billion out of 33.9 Billion funded. Loss rates peaked at 18-19% during the aggressive 2014-2015 expansion phase.
Recommendation: Debt Monetization. The company should establish a secondary pipeline to sell this 4.18B in bad debt to external recovery firms. Even at a 3% recovery price, this would generate 125 Million in immediate cash flow with zero marketing cost.
Customer Profile & Geography¶
1. What is the relationship between employment length and loan amount?¶
conn.rollback()
query_8 = """
SELECT
emp_length,
COUNT(id) AS total_loans,
ROUND(AVG(loan_amnt), 2) AS avg_loan_amount,
MIN(loan_amnt) AS min_loan_amount,
MAX(loan_amnt) AS max_loan_amount,
SUM(loan_amnt) AS total_volume
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
WHERE emp_length IS NOT NULL AND emp_length != 'n/a'
GROUP BY emp_length
ORDER BY
CASE
WHEN emp_length = '< 1 year' THEN 0
WHEN emp_length = '1 year' THEN 1
WHEN emp_length = '2 years' THEN 2
WHEN emp_length = '3 years' THEN 3
WHEN emp_length = '4 years' THEN 4
WHEN emp_length = '5 years' THEN 5
WHEN emp_length = '6 years' THEN 6
WHEN emp_length = '7 years' THEN 7
WHEN emp_length = '8 years' THEN 8
WHEN emp_length = '9 years' THEN 9
WHEN emp_length = '10+ years' THEN 10
ELSE 11
END;
"""
df_8 = pd.read_sql(query_8, conn)
df_8
/tmp/ipykernel_3790/2863495014.py:31: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_8 = pd.read_sql(query_8, conn)
| emp_length | total_loans | avg_loan_amount | min_loan_amount | max_loan_amount | total_volume | |
|---|---|---|---|---|---|---|
| 0 | < 1 year | 189878 | 14823.71 | 500.0 | 40000.0 | 2.814697e+09 |
| 1 | 1 year | 148312 | 14077.03 | 500.0 | 40000.0 | 2.087793e+09 |
| 2 | 2 years | 203594 | 14368.63 | 500.0 | 40000.0 | 2.925367e+09 |
| 3 | 3 years | 180681 | 14499.96 | 500.0 | 40000.0 | 2.619868e+09 |
| 4 | 4 years | 136549 | 14601.01 | 900.0 | 40000.0 | 1.993753e+09 |
| 5 | 5 years | 139648 | 14804.89 | 1000.0 | 40000.0 | 2.067473e+09 |
| 6 | 6 years | 102585 | 14973.37 | 1000.0 | 40000.0 | 1.536043e+09 |
| 7 | 7 years | 92663 | 15148.59 | 500.0 | 40000.0 | 1.403714e+09 |
| 8 | 8 years | 91882 | 15333.95 | 1000.0 | 40000.0 | 1.408914e+09 |
| 9 | 9 years | 79374 | 15414.29 | 1000.0 | 40000.0 | 1.223494e+09 |
| 10 | 10+ years | 747849 | 16243.37 | 600.0 | 40000.0 | 1.214759e+10 |
plt.figure(figsize=(12, 7))
sns.barplot(x='emp_length', y='avg_loan_amount', data=df_8, palette='viridis')
plt.title('Average Loan Amount by Employment Length')
plt.xlabel('Employment Length')
plt.ylabel('Average Loan Amount ($)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
/tmp/ipykernel_3790/2142009232.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='emp_length', y='avg_loan_amount', data=df_8, palette='viridis')
Borrowers with 10+ years of employment are the anchor of the business, holding the largest volume ($12.1B) and the highest average loans. Conversely, borrowers with <1 year are receiving loans similar in size to those with 5 years of tenure—a high-risk acquisition strategy.
Recommendation: Launch a "Veteran Loyalty Program." Offer automatic rate reductions or fee waivers for 10+ year employees on their second or third loan to prevent them from churning to traditional banks.
2. Which states carry the highest credit risk?¶
conn.rollback()
query_9 = """
SELECT
addr_state,
COUNT(id) AS total_loans,
ROUND(SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 ELSE 0 END) * 100 / COUNT(id), 2) AS default_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY addr_state
ORDER BY default_rate DESC
LIMIT 5
"""
df_9 = pd.read_sql(query_9, conn)
df_9
/tmp/ipykernel_3790/532320389.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_9 = pd.read_sql(query_9, conn)
| addr_state | total_loans | default_rate | |
|---|---|---|---|
| 0 | AL | 27276 | 14.0 |
| 1 | AR | 17068 | 14.0 |
| 2 | MS | 12635 | 13.0 |
| 3 | LA | 25754 | 13.0 |
| 4 | NM | 11983 | 13.0 |
plt.figure(figsize=(10, 6))
sns.barplot(x='addr_state', y='default_rate', data=df_9, palette='magma')
plt.title('Top 5 States by Default Rate')
plt.xlabel('State')
plt.ylabel('Default Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
/tmp/ipykernel_3790/620259226.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='addr_state', y='default_rate', data=df_9, palette='magma')
Alabama (AL) and Arkansas (AR) lead in credit risk with a 14% default rate. Southern states dominate the top 5 riskiest geographies.
Recommendation: Apply Geographic Risk Premiums. Loans originating in these top 5 states should include a 1-2% interest rate surcharge to offset the higher regional probability of loss.
3. Is there a clear difference in annual income between payers and non-payers?¶
conn.rollback()
query_10 = """
SELECT
CASE
WHEN annual_inc <= 35000.00 THEN 'low_salary'
WHEN annual_inc <= 60000.00 THEN 'mid_low_salary'
WHEN annual_inc <= 120000.00 THEN 'mid_salary'
WHEN annual_inc <= 250000.00 THEN 'mid_high_salary'
WHEN annual_inc <= 500000.00 THEN 'high_salary'
ELSE 'elite'
END AS salary_range,
COUNT(CASE WHEN loan_status = 'Charged Off' THEN 1 END) AS unpaid,
COUNT(CASE WHEN loan_status = 'Fully Paid' THEN 1 END) AS paid,
ROUND(SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 END) * 100 / COUNT(*), 2) AS unpaid_porcentage,
ROUND(SUM(CASE WHEN loan_status = 'Fully Paid'THEN 1 END) * 100 / COUNT(*), 2) AS paid_porcentage
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
WHERE loan_status IN ('Charged Off', 'Fully Paid')
GROUP BY salary_range
ORDER BY paid_porcentage;
"""
df_10 = pd.read_sql(query_10, conn)
df_10
/tmp/ipykernel_3790/862983782.py:23: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_10 = pd.read_sql(query_10, conn)
| salary_range | unpaid | paid | unpaid_porcentage | paid_porcentage | |
|---|---|---|---|---|---|
| 0 | low_salary | 38455 | 121958 | 23.0 | 76.0 |
| 1 | mid_low_salary | 99888 | 357587 | 21.0 | 78.0 |
| 2 | mid_salary | 107493 | 471207 | 18.0 | 81.0 |
| 3 | mid_high_salary | 20975 | 114300 | 15.0 | 84.0 |
| 4 | high_salary | 1527 | 10171 | 13.0 | 86.0 |
| 5 | elite | 221 | 1528 | 12.0 | 87.0 |
df_10_melted = df_10.melt(id_vars=['salary_range'], value_vars=['unpaid_porcentage', 'paid_porcentage'], var_name='loan_status_type', value_name='percentage')
plt.figure(figsize=(12, 7))
sns.barplot(x='salary_range', y='percentage', hue='loan_status_type', data=df_10_melted, palette={'unpaid_porcentage': 'salmon', 'paid_porcentage': 'lightgreen'})
plt.title('Loan Status Percentage by Salary Range')
plt.xlabel('Salary Range')
plt.ylabel('Percentage (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Loan Status', labels=['Unpaid', 'Paid'])
plt.tight_layout()
plt.show()
Income is a definitive success predictor. There is an 11-percentage-point gap in repayment. The "Low Salary" group defaults at 23%, while the "Elite" group defaults at only 12%.
Recommendation: Zero-Friction Approvals for "Elite" Borrowers. To prevent high-net-worth clients from dropping out of the sales funnel, the approval process for the "Elite" and "High" salary brackets should be fully automated and instant, as their risk profile is the best in the entire portfolio.
Time Series & Trends:¶
1. How has the volume of issued loans evolved year-over-year?¶
conn.rollback()
query_11 = """
SELECT
RIGHT(issue_d, 4) AS loan_year,
COUNT(id) AS number_of_loans,
SUM(loan_amnt) AS total_amount_funded,
ROUND(AVG(loan_amnt), 2) AS avg_loan_amount
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY loan_year
ORDER BY loan_year
"""
df_11 = pd.read_sql(query_11, conn)
df_11
/tmp/ipykernel_3790/2543537009.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. df_11 = pd.read_sql(query_11, conn)
| loan_year | number_of_loans | total_amount_funded | avg_loan_amount | |
|---|---|---|---|---|
| 0 | 2007 | 490 | 3.913650e+06 | 7987.04 |
| 1 | 2008 | 2144 | 1.887440e+07 | 8803.36 |
| 2 | 2009 | 5152 | 5.055665e+07 | 9813.01 |
| 3 | 2010 | 12267 | 1.294226e+08 | 10550.47 |
| 4 | 2011 | 21721 | 2.616838e+08 | 12047.50 |
| 5 | 2012 | 53367 | 7.184110e+08 | 13461.71 |
| 6 | 2013 | 134814 | 1.982765e+09 | 14707.41 |
| 7 | 2014 | 235629 | 3.503840e+09 | 14870.16 |
| 8 | 2015 | 421095 | 6.417608e+09 | 15240.29 |
| 9 | 2016 | 434407 | 6.400570e+09 | 14734.04 |
| 10 | 2017 | 443579 | 6.584957e+09 | 14845.06 |
| 11 | 2018 | 495242 | 7.936263e+09 | 16025.02 |
plt.figure(figsize=(14, 7))
# Plot Number of Loans
plt.subplot(2, 1, 1) # 2 rows, 1 column, first plot
sns.lineplot(x='loan_year', y='number_of_loans', data=df_11, marker='o', color='skyblue')
plt.title('Number of Loans Issued Year-over-Year')
plt.xlabel('Year')
plt.ylabel('Number of Loans')
plt.grid(True, linestyle='--', alpha=0.7)
# Plot Total Amount Funded
plt.subplot(2, 1, 2) # 2 rows, 1 column, second plot
sns.lineplot(x='loan_year', y='total_amount_funded', data=df_11, marker='o', color='lightcoral')
plt.title('Total Amount Funded Year-over-Year')
plt.xlabel('Year')
plt.ylabel('Total Funded Amount ($)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()