In [2]:
!pip install psycopg2-binary
import psycopg2
from google.colab import userdata
password_post = userdata.get('postpass')
conn = psycopg2.connect(
host="6.tcp.ngrok.io",
port=12973,
database="test_db",
user="postgres",
password=password_post
)

cursor = conn.cursor()

cursor.execute("SELECT version();")

print(cursor.fetchone())
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (4.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.2/4.2 MB 56.9 MB/s eta 0:00:00
Installing collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.11
('PostgreSQL 18.1 on x86_64-windows, compiled by msvc-19.44.35221, 64-bit',)
In [3]:
conn.rollback()

cursor.execute("""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'Lending_Club_Loan_Project';
""")

print(cursor.fetchall())
[('rejected_2007_to_2018',), ('accepted_2007_to_2018',)]
In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [5]:
conn.rollback()

query_1 = """
SELECT *
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
LIMIT 5
"""
df_1 = pd.read_sql(query_1, conn)
df_1
/tmp/ipykernel_3790/3215516387.py:8: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_1 = pd.read_sql(query_1, conn)
Out[5]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... hardship_payoff_balance_amount hardship_last_payment_amount disbursement_method debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date settlement_amount settlement_percentage settlement_term
0 143506724.0 None 10000.0 10000.0 10000.0 36 months 19.92 371.23 D D3 ... None None Cash N None None None None None None
1 143181109.0 None 28500.0 28500.0 28500.0 60 months 11.80 631.10 B B4 ... None None Cash N None None None None None None
2 143512446.0 None 8000.0 8000.0 8000.0 36 months 11.31 263.09 B B3 ... None None Cash N None None None None None None
3 143288348.0 None 7500.0 7500.0 7500.0 36 months 20.89 282.14 D D4 ... None None DirectPay N None None None None None None
4 143223161.0 None 30750.0 30750.0 30750.0 36 months 23.40 1196.75 E E1 ... None None Cash N None None None None None None

5 rows × 151 columns

Credit Risk & Default Analysis¶

1. What is the Default Rate by Credit Grade?¶

In [6]:
conn.rollback()

query_2 = """
SELECT
    grade,
    sub_grade,
    COUNT(id) AS total_loans,
    ROUND(
        (SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 ELSE 0 END) * 100.0) / COUNT(id),
        2
    ) AS default_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY grade, sub_grade
ORDER BY grade;
"""

df_2 = pd.read_sql(query_2, conn)
df_2
/tmp/ipykernel_3790/1170315299.py:17: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_2 = pd.read_sql(query_2, conn)
Out[6]:
grade sub_grade total_loans default_rate
0 A A1 86790 1.62
1 A A2 69559 2.49
2 A A3 73184 2.86
3 A A4 95874 3.74
4 A A5 107612 5.00
5 B B1 125326 5.92
6 B B2 126610 6.64
7 B B3 131502 8.08
8 B B4 139770 8.83
9 B B5 140264 9.83
10 C C1 145881 11.13
11 C C2 131078 12.52
12 C C3 129159 13.08
13 C C4 127087 14.59
14 C C5 116700 15.05
15 D D1 81752 17.46
16 D D2 72856 18.21
17 D D3 64782 18.54
18 D D4 56853 20.25
19 D D5 47984 20.83
20 E E1 33537 25.54
21 E E2 29887 26.85
22 E E3 26673 26.69
23 E E4 22729 27.74
24 E E5 22655 26.58
25 F F1 13392 31.49
26 F F2 9288 35.14
27 F F3 7772 35.31
28 F F4 6104 38.12
29 F F5 5151 37.64
30 G G1 4093 35.04
31 G G2 2678 39.10
32 G G3 2084 39.78
33 G G4 1692 38.71
34 G G5 1549 38.41
In [7]:
plt.figure(figsize=(10, 6))
sns.barplot(x='grade', y='default_rate', data=df_2, palette='viridis')
plt.title('Average Default Rate by Credit Grade')
plt.xlabel('Credit Grade')
plt.ylabel('Default Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
/tmp/ipykernel_3790/2743319193.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='grade', y='default_rate', data=df_2, palette='viridis')
No description has been provided for this image

The grading system is highly accurate. Default rates climb steadily from 1.62% (A1) to nearly 40% (G3). A critical "risk cliff" exists between Grade D (19%) and Grade E (26%), indicating a significant shift in borrower stability at this threshold.

Recommendation: Implement an "Early Warning System" for Grades D and E. Since 1 in 4 of these borrowers will default, the collections department should initiate proactive, soft-touch outreach as soon as a payment is 1 day past due.

2. How does the Debt-to-Income (DTI) ratio influence default?¶

In [8]:
conn.rollback()

query_3 = """
SELECT
  CASE
    WHEN dti <= 20.00 THEN 'low_dti'
    WHEN dti > 20.00 AND dti <= 40.00 THEN 'mid_dti'
    ELSE 'high_dti'
  END AS dti_range,
ROUND(
        (SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 ELSE 0 END) * 100.0) / COUNT(id),
        2
    ) AS default_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY dti_range
ORDER BY default_rate;
"""

df_3 = pd.read_sql(query_3, conn)
df_3
/tmp/ipykernel_3790/1285300349.py:19: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_3 = pd.read_sql(query_3, conn)
Out[8]:
dti_range default_rate
0 high_dti 7.40
1 low_dti 10.29
2 mid_dti 14.38
In [9]:
plt.figure(figsize=(10, 6))
sns.barplot(x='dti_range', y='default_rate', data=df_3, palette='viridis')
plt.title('Average Default Rate by Credit Grade')
plt.xlabel('dti_range')
plt.ylabel('Default Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
/tmp/ipykernel_3790/4033595918.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='dti_range', y='default_rate', data=df_3, palette='viridis')
No description has been provided for this image

The relationship is non-linear. The Mid-DTI group (20-40) is the riskiest (14.38% default), while the High-DTI group (>40) is ironically the safest (7.40%). This suggests a "selection bias" where high-DTI loans are only approved for borrowers with otherwise flawless credit profiles.

Recommendation: Tighten credit score requirements specifically for the 20-40 DTI range, as this appears to be the "financial tipping point" where borrowers are most vulnerable to economic shocks.

3. Does home ownership status affect the probability of repayment?¶

In [10]:
conn.rollback()

query_4 = """
SELECT home_ownership,
ROUND(
        (SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 ELSE 0 END) * 100.0) / COUNT(id),
        2
    ) AS default_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY home_ownership
ORDER BY default_rate;
"""

df_4 = pd.read_sql(query_4, conn)
df_4
/tmp/ipykernel_3790/945423556.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_4 = pd.read_sql(query_4, conn)
Out[10]:
home_ownership default_rate
0 ANY 5.62
1 MORTGAGE 10.31
2 OWN 11.80
3 NONE 13.21
4 RENT 13.87
5 OTHER 15.79
In [11]:
plt.figure(figsize=(10, 6))
sns.barplot(x='home_ownership', y='default_rate', data=df_4, palette='viridis')
plt.title('Average Default Rate by home_ownership')
plt.xlabel('home_ownership')
plt.ylabel('Default Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
/tmp/ipykernel_3790/2456128957.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='home_ownership', y='default_rate', data=df_4, palette='viridis')
No description has been provided for this image

Stability is key. Borrowers with a MORTGAGE (10.31% default) are significantly safer than those who RENT (13.87%). This 3.5% gap represents a major opportunity for risk-adjusted pricing.

Recommendation: Use homeownership as a "limit booster." Mortgage holders should be eligible for higher credit limits, while Renters should start with lower "test" limits until they prove 12 months of consistent payment history.

Profitability & Interest Rates¶

1. What is the average interest rate charged by loan purpose?¶

In [12]:
conn.rollback()

query_5 = """
SELECT purpose, AVG(int_rate) AS int_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY purpose
ORDER BY int_rate
"""
df_5 = pd.read_sql(query_5, conn)
df_5
/tmp/ipykernel_3790/2700889306.py:9: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_5 = pd.read_sql(query_5, conn)
Out[12]:
purpose int_rate
0 credit_card 11.696088
1 educational 11.958036
2 car 12.181766
3 home_improvement 12.620322
4 major_purchase 12.762865
5 vacation 13.451090
6 debt_consolidation 13.522794
7 medical 13.631922
8 wedding 14.152037
9 other 14.235124
10 house 14.395558
11 renewable_energy 14.727548
12 moving 14.738670
13 small_business 15.263895
In [13]:
plt.figure(figsize=(12, 7))
sns.barplot(x='int_rate', y='purpose', data=df_5.sort_values(by='int_rate', ascending=False), palette='viridis')
plt.title('Average Interest Rate by Loan Purpose')
plt.xlabel('Average Interest Rate (%)')
plt.ylabel('Loan Purpose')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
/tmp/ipykernel_3790/4226815170.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='int_rate', y='purpose', data=df_5.sort_values(by='int_rate', ascending=False), palette='viridis')
No description has been provided for this image

Credit card refinancing carries the lowest rates (11.70%), while Small Business loans carry the highest (15.26%). This reflects the high mortality rate of new ventures compared to stabilizing existing debt.

Recommendation: Aggressively market "Credit Card Refinance" to "Debt Consolidation" borrowers. By shifting them into the lower-rate category with stricter documentation, you can capture higher-quality volume from competitors.

2. What is the total loss from "Charged Off" loans vs. total amount funded?¶

In [14]:
conn.rollback()
query_6 = """
SELECT
  COUNT(id) AS total_loans,
  SUM(funded_amnt) AS total_funded_amnt,
  COUNT(CASE WHEN loan_status = 'Charged Off' THEN id END) AS num_of_loans_loses,
  SUM(CASE WHEN loan_status = 'Charged Off' THEN loan_amnt ELSE 0 END) AS total_loses
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018;

"""

df_6 = pd.read_sql(query_6, conn)
df_6
/tmp/ipykernel_3790/4183277396.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_6 = pd.read_sql(query_6, conn)
Out[14]:
total_loans total_funded_amnt num_of_loans_loses total_loses
0 2259907 3.399717e+10 268559 4.180136e+09
In [15]:
conn.rollback()
query_7 = """
SELECT
  issue_d,
  COUNT(id) AS total_loans,
  SUM(funded_amnt) AS total_funded_amnt,
  COUNT(CASE WHEN loan_status = 'Charged Off' THEN id END) AS num_of_loans_loses,
  SUM(CASE WHEN loan_status = 'Charged Off' THEN loan_amnt ELSE 0 END) AS total_loses
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY issue_d
ORDER BY issue_d;
"""

df_7 = pd.read_sql(query_7, conn)
df_7["issue_d"] = pd.to_datetime(df_7["issue_d"], format="%b-%Y")
df_7["issue_d"] = df_7["issue_d"].dt.strftime('%b-%Y')
df_7
/tmp/ipykernel_3790/3613905381.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_7 = pd.read_sql(query_7, conn)
Out[15]:
issue_d total_loans total_funded_amnt num_of_loans_loses total_loses
0 Apr-2008 223 1580975.0 27 331700.0
1 Apr-2009 324 2981875.0 40 417200.0
2 Apr-2010 893 9311075.0 82 830550.0
3 Apr-2011 1563 17520000.0 222 2877750.0
4 Apr-2012 3230 42051125.0 496 6970325.0
... ... ... ... ... ...
134 Sep-2014 10606 159444175.0 1879 29054400.0
135 Sep-2015 28641 450246800.0 5180 83119550.0
136 Sep-2016 28144 399512400.0 4203 64750825.0
137 Sep-2017 39713 601398225.0 3347 54370925.0
138 Sep-2018 39026 635394775.0 173 2726500.0

139 rows × 5 columns

In [16]:
plt.figure(figsize=(15, 6))
sns.lineplot(x='issue_d', y='total_loses', data=df_7)
plt.title('Total Losses by Issue Date (Month-Year)')
plt.xlabel('Issue Date')
plt.ylabel('Total Losses ($)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [17]:
df_7['issue_d'] = pd.to_datetime(df_7['issue_d'], format='%b-%Y')
df_7['loss_rate'] = (df_7['total_loses'] / df_7['total_funded_amnt']) * 100

plt.figure(figsize=(15, 6))
sns.lineplot(x='issue_d', y='loss_rate', data=df_7)
plt.title('Loss Rate by Issue Date (Month-Year)')
plt.xlabel('Issue Date')
plt.ylabel('Loss Rate (%)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image

The portfolio has "Charged Off" 4.18 Billion out of 33.9 Billion funded. Loss rates peaked at 18-19% during the aggressive 2014-2015 expansion phase.

Recommendation: Debt Monetization. The company should establish a secondary pipeline to sell this 4.18B in bad debt to external recovery firms. Even at a 3% recovery price, this would generate 125 Million in immediate cash flow with zero marketing cost.

Customer Profile & Geography¶

1. What is the relationship between employment length and loan amount?¶

In [18]:
conn.rollback()

query_8 = """
SELECT
    emp_length,
    COUNT(id) AS total_loans,
    ROUND(AVG(loan_amnt), 2) AS avg_loan_amount,
    MIN(loan_amnt) AS min_loan_amount,
    MAX(loan_amnt) AS max_loan_amount,
    SUM(loan_amnt) AS total_volume
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
WHERE emp_length IS NOT NULL AND emp_length != 'n/a'
GROUP BY emp_length
ORDER BY
    CASE
        WHEN emp_length = '< 1 year' THEN 0
        WHEN emp_length = '1 year' THEN 1
        WHEN emp_length = '2 years' THEN 2
        WHEN emp_length = '3 years' THEN 3
        WHEN emp_length = '4 years' THEN 4
        WHEN emp_length = '5 years' THEN 5
        WHEN emp_length = '6 years' THEN 6
        WHEN emp_length = '7 years' THEN 7
        WHEN emp_length = '8 years' THEN 8
        WHEN emp_length = '9 years' THEN 9
        WHEN emp_length = '10+ years' THEN 10
        ELSE 11
    END;
"""

df_8 = pd.read_sql(query_8, conn)
df_8
/tmp/ipykernel_3790/2863495014.py:31: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_8 = pd.read_sql(query_8, conn)
Out[18]:
emp_length total_loans avg_loan_amount min_loan_amount max_loan_amount total_volume
0 < 1 year 189878 14823.71 500.0 40000.0 2.814697e+09
1 1 year 148312 14077.03 500.0 40000.0 2.087793e+09
2 2 years 203594 14368.63 500.0 40000.0 2.925367e+09
3 3 years 180681 14499.96 500.0 40000.0 2.619868e+09
4 4 years 136549 14601.01 900.0 40000.0 1.993753e+09
5 5 years 139648 14804.89 1000.0 40000.0 2.067473e+09
6 6 years 102585 14973.37 1000.0 40000.0 1.536043e+09
7 7 years 92663 15148.59 500.0 40000.0 1.403714e+09
8 8 years 91882 15333.95 1000.0 40000.0 1.408914e+09
9 9 years 79374 15414.29 1000.0 40000.0 1.223494e+09
10 10+ years 747849 16243.37 600.0 40000.0 1.214759e+10
In [19]:
plt.figure(figsize=(12, 7))
sns.barplot(x='emp_length', y='avg_loan_amount', data=df_8, palette='viridis')
plt.title('Average Loan Amount by Employment Length')
plt.xlabel('Employment Length')
plt.ylabel('Average Loan Amount ($)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
/tmp/ipykernel_3790/2142009232.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='emp_length', y='avg_loan_amount', data=df_8, palette='viridis')
No description has been provided for this image

Borrowers with 10+ years of employment are the anchor of the business, holding the largest volume ($12.1B) and the highest average loans. Conversely, borrowers with <1 year are receiving loans similar in size to those with 5 years of tenure—a high-risk acquisition strategy.

Recommendation: Launch a "Veteran Loyalty Program." Offer automatic rate reductions or fee waivers for 10+ year employees on their second or third loan to prevent them from churning to traditional banks.

2. Which states carry the highest credit risk?¶

In [20]:
conn.rollback()

query_9 = """
SELECT
	addr_state,
	COUNT(id) AS total_loans,
	ROUND(SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 ELSE 0 END) * 100 / COUNT(id), 2) AS default_rate
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY addr_state
ORDER BY default_rate DESC
LIMIT 5
"""

df_9 = pd.read_sql(query_9, conn)
df_9
/tmp/ipykernel_3790/532320389.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_9 = pd.read_sql(query_9, conn)
Out[20]:
addr_state total_loans default_rate
0 AL 27276 14.0
1 AR 17068 14.0
2 MS 12635 13.0
3 LA 25754 13.0
4 NM 11983 13.0
In [21]:
plt.figure(figsize=(10, 6))
sns.barplot(x='addr_state', y='default_rate', data=df_9, palette='magma')
plt.title('Top 5 States by Default Rate')
plt.xlabel('State')
plt.ylabel('Default Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
/tmp/ipykernel_3790/620259226.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='addr_state', y='default_rate', data=df_9, palette='magma')
No description has been provided for this image

Alabama (AL) and Arkansas (AR) lead in credit risk with a 14% default rate. Southern states dominate the top 5 riskiest geographies.

Recommendation: Apply Geographic Risk Premiums. Loans originating in these top 5 states should include a 1-2% interest rate surcharge to offset the higher regional probability of loss.

3. Is there a clear difference in annual income between payers and non-payers?¶

In [22]:
conn.rollback()

query_10 = """
SELECT
	CASE
		WHEN annual_inc <= 35000.00 THEN 'low_salary'
		WHEN annual_inc <= 60000.00 THEN 'mid_low_salary'
		WHEN annual_inc <= 120000.00 THEN 'mid_salary'
		WHEN annual_inc <= 250000.00 THEN 'mid_high_salary'
		WHEN annual_inc <= 500000.00 THEN 'high_salary'
		ELSE 'elite'
	END AS salary_range,
COUNT(CASE WHEN loan_status = 'Charged Off' THEN 1 END) AS unpaid,
COUNT(CASE WHEN loan_status = 'Fully Paid' THEN 1 END) AS paid,
ROUND(SUM(CASE WHEN loan_status = 'Charged Off' THEN 1 END) * 100 / COUNT(*), 2) AS unpaid_porcentage,
ROUND(SUM(CASE WHEN loan_status = 'Fully Paid'THEN 1 END) * 100 / COUNT(*), 2) AS paid_porcentage
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
WHERE loan_status IN ('Charged Off', 'Fully Paid')
GROUP BY salary_range
ORDER BY paid_porcentage;
"""

df_10 = pd.read_sql(query_10, conn)
df_10
/tmp/ipykernel_3790/862983782.py:23: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_10 = pd.read_sql(query_10, conn)
Out[22]:
salary_range unpaid paid unpaid_porcentage paid_porcentage
0 low_salary 38455 121958 23.0 76.0
1 mid_low_salary 99888 357587 21.0 78.0
2 mid_salary 107493 471207 18.0 81.0
3 mid_high_salary 20975 114300 15.0 84.0
4 high_salary 1527 10171 13.0 86.0
5 elite 221 1528 12.0 87.0
In [23]:
df_10_melted = df_10.melt(id_vars=['salary_range'], value_vars=['unpaid_porcentage', 'paid_porcentage'], var_name='loan_status_type', value_name='percentage')

plt.figure(figsize=(12, 7))
sns.barplot(x='salary_range', y='percentage', hue='loan_status_type', data=df_10_melted, palette={'unpaid_porcentage': 'salmon', 'paid_porcentage': 'lightgreen'})
plt.title('Loan Status Percentage by Salary Range')
plt.xlabel('Salary Range')
plt.ylabel('Percentage (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Loan Status', labels=['Unpaid', 'Paid'])
plt.tight_layout()
plt.show()
No description has been provided for this image

Income is a definitive success predictor. There is an 11-percentage-point gap in repayment. The "Low Salary" group defaults at 23%, while the "Elite" group defaults at only 12%.

Recommendation: Zero-Friction Approvals for "Elite" Borrowers. To prevent high-net-worth clients from dropping out of the sales funnel, the approval process for the "Elite" and "High" salary brackets should be fully automated and instant, as their risk profile is the best in the entire portfolio.

Time Series & Trends:¶

1. How has the volume of issued loans evolved year-over-year?¶

In [24]:
conn.rollback()

query_11 = """
SELECT
    RIGHT(issue_d, 4) AS loan_year,
    COUNT(id) AS number_of_loans,
    SUM(loan_amnt) AS total_amount_funded,
    ROUND(AVG(loan_amnt), 2) AS avg_loan_amount
FROM "Lending_Club_Loan_Project".accepted_2007_to_2018
GROUP BY loan_year
ORDER BY loan_year
"""

df_11 = pd.read_sql(query_11, conn)
df_11
/tmp/ipykernel_3790/2543537009.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df_11 = pd.read_sql(query_11, conn)
Out[24]:
loan_year number_of_loans total_amount_funded avg_loan_amount
0 2007 490 3.913650e+06 7987.04
1 2008 2144 1.887440e+07 8803.36
2 2009 5152 5.055665e+07 9813.01
3 2010 12267 1.294226e+08 10550.47
4 2011 21721 2.616838e+08 12047.50
5 2012 53367 7.184110e+08 13461.71
6 2013 134814 1.982765e+09 14707.41
7 2014 235629 3.503840e+09 14870.16
8 2015 421095 6.417608e+09 15240.29
9 2016 434407 6.400570e+09 14734.04
10 2017 443579 6.584957e+09 14845.06
11 2018 495242 7.936263e+09 16025.02
In [25]:
plt.figure(figsize=(14, 7))

# Plot Number of Loans
plt.subplot(2, 1, 1) # 2 rows, 1 column, first plot
sns.lineplot(x='loan_year', y='number_of_loans', data=df_11, marker='o', color='skyblue')
plt.title('Number of Loans Issued Year-over-Year')
plt.xlabel('Year')
plt.ylabel('Number of Loans')
plt.grid(True, linestyle='--', alpha=0.7)

# Plot Total Amount Funded
plt.subplot(2, 1, 2) # 2 rows, 1 column, second plot
sns.lineplot(x='loan_year', y='total_amount_funded', data=df_11, marker='o', color='lightcoral')
plt.title('Total Amount Funded Year-over-Year')
plt.xlabel('Year')
plt.ylabel('Total Funded Amount ($)')
plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()
No description has been provided for this image