import numpy as np
import pandas as pd
data = pd.read_csv('teledata_done.csv',sep=';', encoding='iso-8859-1')
data.head()
C:\Users\Bruger\AppData\Local\Temp\ipykernel_14744\4022911079.py:3: DtypeWarning: Columns (16,17,18,19,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.
data = pd.read_csv('teledata_done.csv',sep=';', encoding='iso-8859-1')
| baseid | fornavn | efternavn | adresse | stednavn | postnummer | bynavn | kommunekode | vej | husnr | ... | ssh_anden_hustype | ssh_enlig_m_born | ssh_enlig_u_born | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65536878 | ec107270-e365-41aa-82f7-ac7716e0a06f | 38c49621-2e54-4ae2-90ac-c54a47a657ec | Søbredden 24 | Svogerslev | 4000 | Roskilde | 265 | Søbredden | 24 | ... | 0,25 | 0 | 0 | 0,75 | 0 | 29.0 | 1196.0 | Ejer | Parcelhus | 5.0 |
| 1 | 63503287 | 5f229333-fe30-404d-a8f7-881d082fbd87 | 679d6544-5a01-4cb5-bc73-ccc936f50482 | Jacob Appels Alle 36 | NaN | 2770 | Kastrup | 185 | Jacob Appels Alle | 36 | ... | 0,121212121212121 | 0 | 0,121212121212121 | 0,757575757575758 | 0 | 35.0 | 569.0 | Ejer | Række/kæde/dobbelthus | 5.0 |
| 2 | 62199324 | 1a25f102-40f6-46c5-9ed7-f79259118aca | 0697dcfd-103d-454f-b8c4-f3e22abc6f68 | Højbovænge 8 | NaN | 4660 | Store Heddinge | 336 | Højbovænge | 8 | ... | 0 | 0,111111111111111 | 0,444444444444444 | 0,111111111111111 | 0,333333333333333 | 19.0 | 302.0 | Lejer | Række/kæde/dobbelthus | 3.0 |
| 3 | 64841017 | 348dc61c-209c-4b77-a3fe-9cde68ff65f6 | a18f669c-9bf6-498a-84bf-783a7b93b38d | Engdraget 2 | Gårslev | 7080 | Børkop | 630 | Engdraget | 2 | ... | 0,2 | 0 | 0 | 0,4 | 0,4 | 43.0 | 1747.0 | Ejer | Parcelhus | 7.0 |
| 4 | 10055668 | a64ff2ca-bb9d-46ab-8570-28bd39ebde27 | ca8cf5f7-6eff-4f98-840a-60baaa3679ab | Irisvej 9 | NaN | 3300 | Frederiksværk | 260 | Irisvej | 9 | ... | 0,333333333333333 | 0 | 0 | 0,333333333333333 | 0,333333333333333 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 |
5 rows × 66 columns
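The DtypeWarning above comes from pandas inferring column types chunk by chunk. As the warning itself suggests, reading the file in one pass (or passing an explicit dtype for the mixed columns) silences it; a minimal sketch:

data = pd.read_csv('teledata_done.csv', sep=';', encoding='iso-8859-1',
                   low_memory=False)  # one pass, one inferred dtype per column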
for column in data.columns:
print('Total number of unique entities in ' +str(column)+ ' Column is == ' + str(len(data[column].unique())))
# data[column].unique()
Total number of unique entities in baseid Column is == 2182677
Total number of unique entities in fornavn Column is == 875079
Total number of unique entities in efternavn Column is == 136889
Total number of unique entities in adresse Column is == 1394662
Total number of unique entities in stednavn Column is == 6231
Total number of unique entities in postnummer Column is == 1060
Total number of unique entities in bynavn Column is == 611
Total number of unique entities in kommunekode Column is == 102
Total number of unique entities in vej Column is == 45149
Total number of unique entities in husnr Column is == 988
Total number of unique entities in bogstav Column is == 27
Total number of unique entities in sal Column is == 103
Total number of unique entities in side Column is == 2958
Total number of unique entities in kvhx Column is == 1761391
Total number of unique entities in robinson Column is == 4
Total number of unique entities in tlfnr1 Column is == 2173321
Total number of unique entities in tlfnr2 Column is == 643562
Total number of unique entities in tlfnr3 Column is == 194333
Total number of unique entities in tlfnr4 Column is == 62137
Total number of unique entities in tlfnr5 Column is == 18420
Total number of unique entities in tlftype1 Column is == 3
Total number of unique entities in tlftype2 Column is == 3
Total number of unique entities in tlftype3 Column is == 3
Total number of unique entities in tlftype4 Column is == 3
Total number of unique entities in tlftype5 Column is == 3
Total number of unique entities in koen Column is == 3
Total number of unique entities in alder Column is == 82
Total number of unique entities in mosaic_gruppe Column is == 13
Total number of unique entities in mosaic_type Column is == 45
Total number of unique entities in ssh_0_born Column is == 360
Total number of unique entities in ssh_1_barn Column is == 239
Total number of unique entities in ssh_2_born Column is == 259
Total number of unique entities in ssh_3_plus_born Column is == 432
Total number of unique entities in udd_grundskole Column is == 2464
Total number of unique entities in udd_almen_gymnasial Column is == 2433
Total number of unique entities in udd_erhvervsgymasial Column is == 2
Total number of unique entities in udd_erhvervsfaglig_forloeb Column is == 3019
Total number of unique entities in udd_kort_videregaaende Column is == 1449
Total number of unique entities in udd_mellemlang_videregaaende Column is == 2730
Total number of unique entities in udd_bachelor Column is == 1643
Total number of unique entities in udd_lang_videregaaende Column is == 1319
Total number of unique entities in udd_forsker Column is == 1319
Total number of unique entities in udd_uoplyst Column is == 1236
Total number of unique entities in socio_high_selvst Column is == 782
Total number of unique entities in socio_mellemniveau Column is == 252
Total number of unique entities in socio_grundniveau Column is == 617
Total number of unique entities in socio_ledig_kontant Column is == 338
Total number of unique entities in socio_pensionist Column is == 890
Total number of unique entities in socio_other Column is == 443
Total number of unique entities in civilstand_ugift Column is == 324
Total number of unique entities in civilstand_gift Column is == 317
Total number of unique entities in civilstand_skilt Column is == 252
Total number of unique entities in civilstand_enke Column is == 289
Total number of unique entities in okonomisk_formaaen Column is == 6
Total number of unique entities in antal_beboere Column is == 89
Total number of unique entities in husstandsindkomst Column is == 299818
Total number of unique entities in ssh_anden_hustype Column is == 296
Total number of unique entities in ssh_enlig_m_born Column is == 252
Total number of unique entities in ssh_enlig_u_born Column is == 408
Total number of unique entities in ssh_par_m_born Column is == 344
Total number of unique entities in ssh_par_u_born Column is == 342
Total number of unique entities in donation_ssh Column is == 91
Total number of unique entities in donation_gns Column is == 8429
Total number of unique entities in ejerforhold Column is == 7
Total number of unique entities in enhedsanvendelse Column is == 7
Total number of unique entities in antal vaerelser Column is == 112
baseid: Unique key for each lead in the dataset.
fornavn: The first name of each lead. This column can contain both the first and middle name of the lead.
efternavn: The last name of each lead. This column can contain both the middle and last name of the lead, but will in most cases only be the last name.
adresse: The address of the lead. This cell contains the following: street name, house number, letter, floor, and which side of the floor the lead lives on.
stednavn: Please ask the person; the explanation for this feature is missing.
postnummer: The zip code where the lead lives. The zip code is always 4 digits.
bynavn: The name of the city the address belongs to.
kommunekode: Municipality code, a numeric identifier of the municipality the address is situated in. The municipality code is always 3 digits.
vej: The street name of the address.
husnr: The house number of the address.
bogstav: If a letter belongs to the house number of the address, it is in this column.
sal: If the address has a floor, you can see it here.
side: If there are multiple placements on a floor, this column describes the placement of the lead's address on that floor.
kvhx: A numeric code that describes the address. The code is built this way: K: municipality code (first 3 digits), V: street code (next 4 digits), H: house number (next 4 digits), X: floor and placement (floor: next 2 digits, placement: last digits).
The code is built like this: KKKVVVVHHHHEESSSS
Rules: the municipality code is always 3 digits, the street code 4 digits, the house number 4 digits, the floor 2 digits, and the placement 4 digits. A small parsing sketch follows below.
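To make the kvhx layout concrete, here is a minimal fixed-width parser; the helper name parse_kvhx and the exact slice boundaries are assumptions derived from the digit counts stated above.

def parse_kvhx(kvhx: str) -> dict:
    """Split a KKKVVVVHHHHEESSSS address code into its components."""
    return {
        'kommunekode': kvhx[0:3],    # K: municipality code, 3 digits
        'vejkode':     kvhx[3:7],    # V: street code, 4 digits
        'husnummer':   kvhx[7:11],   # H: house number, 4 digits
        'etage':       kvhx[11:13],  # E: floor, 2 digits
        'placering':   kvhx[13:],    # S: placement on the floor, last digits
    }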
robinson: Describes whether we are allowed to call the person. 0/blank = no one at this address is registered on the "Robinson list", i.e. we are allowed to call the person; 1 = the person is registered on the "Robinson list", i.e. we are not allowed to call the person; 2 = partial match on the name and the address on the "Robinson list", i.e. we can call the person; 3 = match on the address on the "Robinson list", i.e. we can call the person. See the lookup sketch below.
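Only code 1 blocks a call under this scheme. A minimal sketch of that rule; the names ROBINSON_CALLABLE and may_call are illustrative assumptions, not from the source data.

# True = we may call. Per the description above, only robinson == 1 blocks a call.
ROBINSON_CALLABLE = {0: True, 1: False, 2: True, 3: True}

def may_call(robinson_code) -> bool:
    if pd.isna(robinson_code):  # blank counts as 0: no one registered
        return True
    return ROBINSON_CALLABLE[int(robinson_code)]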
tlfnr1: Primary phone number.
tlfnr2: Secondary phone number.
tlfnr3: Tertiary phone number.
tlfnr4: Fourth phone number.
tlfnr5: Fifth phone number.
tlftype1: What type of phone the primary phone number is: TLF = landline, MOBIL = cell phone.
tlftype2: What type of phone the secondary phone number is: TLF = landline, MOBIL = cell phone.
tlftype3: What type of phone the tertiary phone number is: TLF = landline, MOBIL = cell phone.
tlftype4: What type of phone the fourth phone number is: TLF = landline, MOBIL = cell phone.
tlftype5: What type of phone the fifth phone number is: TLF = landline, MOBIL = cell phone.
koen: The gender the lead presumably has. M = male, K = female, blank = unknown. The gender is estimated based on the first name.
alder: The age the lead presumably has. It is based on the first name and on other variables that are unknown to us.
mosaic_gruppe: The Mosaic group the lead belongs to. Mosaic groups are segmented groups of people who are believed to share certain age, gender, income, financial, family and educational backgrounds.
mosaic_type: The Mosaic type the lead belongs to. Mosaic types are subgroups of the Mosaic groups and segment people further on the same characteristics.
ssh_0_born: Probability of 0 children in the household - based on cluster data.
ssh_1_barn: Probability of 1 child in the household - based on cluster data.
ssh_2_born: Probability of 2 children in the household - based on cluster data.
ssh_3_plus_born: Probability of 3 or more children in the household - based on cluster data.
udd_grundskole: The probability that the lead has completed primary school (grundskole).
udd_almen_gymnasial: The probability that the lead has completed general upper secondary school.
udd_erhvervsgymasial: The probability that the lead has completed vocational upper secondary school.
udd_erhvervsfaglig_forloeb: The probability that the lead has completed a vocational course.
udd_kort_videregaaende: The probability that the lead has completed a short higher education.
udd_mellemlang_videregaaende: The probability that the lead has completed a medium-term higher education.
udd_bachelor: The probability that the lead has completed a bachelor's degree.
udd_lang_videregaaende: The probability that the lead has completed a long higher education.
udd_forsker: The probability that the lead has a PhD.
udd_uoplyst: The probability that the lead's educational background is unknown.
socio_high_selvst: The probability that the lead has a high-level or self-employed occupational background.
socio_mellemniveau: The probability that the lead has an intermediate occupational background.
socio_grundniveau: The probability that the lead has a basic occupational background.
socio_ledig_kontant: The probability that the lead has an unemployed occupational background.
socio_pensionist: The probability that the lead has a pensioner occupational background.
socio_other: The probability that the lead has another occupational background.
civilstand_ugift: Probability that the lead is unmarried.
civilstand_gift: Probability that the lead is married.
civilstand_skilt: Probability that the lead is divorced.
civilstand_enke: Probability that the lead is a widow/widower.
okonomisk_formaaen: Financial capacity. The order is descending, with the top being the best: Hvid = white, Grøn = green, Grå = grey, Gul = yellow, Rød = red. The lighter the colour, the better the financial capacity.
antal_beboere: Estimated number of people living at the aforementioned address.
husstandsindkomst: Average household income in DKK - based on cluster data.
ssh_anden_hustype: Probability that the household is of another type.
ssh_enlig_m_born: Probability that the lead is a single parent with one or more children.
ssh_enlig_u_born: Probability that the lead is a single adult without children.
ssh_par_m_born: Probability that the lead is in a couple with one or more children.
ssh_par_u_born: Probability that the lead is in a couple without children.
donation_ssh: Probability of donation: how probable it is that the lead will donate to charities - based on cluster data.
donation_gns: Average donation (gns = gennemsnit, "average") - based on cluster data.
ejerforhold: How the house is owned. Ejer = owner, Lejer = tenant, Andel = cooperative share, Andet = other.
enhedsanvendelse: How the house is used. Stuehus til landbrug = farmhouse, Parcelhus = detached house, Række/kæde/dobbelthus = townhouse/chain house/semi-detached house, Lejlighed = apartment, Sommerhus = summer house/vacation house, Andet = other.
antal vaerelser: How many rooms the house/apartment has.
data_response = pd.read_csv('Call_logs_done.csv',sep=';',encoding='iso-8859-1')
data_response.head()
| tlfnr | call_ending_reason | date | time | |
|---|---|---|---|---|
| 0 | NaN | NaN | 01-04-2019 | 12:18 |
| 1 | NaN | NaN | 01-04-2019 | 13:33 |
| 2 | NaN | NaN | 02-04-2019 | 09:35 |
| 3 | NaN | NaN | 02-04-2019 | 09:36 |
| 4 | NaN | NaN | 02-04-2019 | 09:41 |
# Copy the call outcome over via the shared index; this assumes the two CSVs
# are row-aligned (lead i in teledata matches row i in the call log).
data['call_ending_reason'] = data_response['call_ending_reason']
data.head()
| baseid | fornavn | efternavn | adresse | stednavn | postnummer | bynavn | kommunekode | vej | husnr | ... | ssh_enlig_m_born | ssh_enlig_u_born | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65536878 | ec107270-e365-41aa-82f7-ac7716e0a06f | 38c49621-2e54-4ae2-90ac-c54a47a657ec | Søbredden 24 | Svogerslev | 4000 | Roskilde | 265 | Søbredden | 24 | ... | 0 | 0 | 0,75 | 0 | 29.0 | 1196.0 | Ejer | Parcelhus | 5.0 | NaN |
| 1 | 63503287 | 5f229333-fe30-404d-a8f7-881d082fbd87 | 679d6544-5a01-4cb5-bc73-ccc936f50482 | Jacob Appels Alle 36 | NaN | 2770 | Kastrup | 185 | Jacob Appels Alle | 36 | ... | 0 | 0,121212121212121 | 0,757575757575758 | 0 | 35.0 | 569.0 | Ejer | Række/kæde/dobbelthus | 5.0 | NaN |
| 2 | 62199324 | 1a25f102-40f6-46c5-9ed7-f79259118aca | 0697dcfd-103d-454f-b8c4-f3e22abc6f68 | Højbovænge 8 | NaN | 4660 | Store Heddinge | 336 | Højbovænge | 8 | ... | 0,111111111111111 | 0,444444444444444 | 0,111111111111111 | 0,333333333333333 | 19.0 | 302.0 | Lejer | Række/kæde/dobbelthus | 3.0 | NaN |
| 3 | 64841017 | 348dc61c-209c-4b77-a3fe-9cde68ff65f6 | a18f669c-9bf6-498a-84bf-783a7b93b38d | Engdraget 2 | Gårslev | 7080 | Børkop | 630 | Engdraget | 2 | ... | 0 | 0 | 0,4 | 0,4 | 43.0 | 1747.0 | Ejer | Parcelhus | 7.0 | NaN |
| 4 | 10055668 | a64ff2ca-bb9d-46ab-8570-28bd39ebde27 | ca8cf5f7-6eff-4f98-840a-60baaa3679ab | Irisvej 9 | NaN | 3300 | Frederiksværk | 260 | Irisvej | 9 | ... | 0 | 0 | 0,333333333333333 | 0,333333333333333 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | NaN |
5 rows × 67 columns
The only data that is useful to us is data with a known, meaningful outcome. Calls that were never answered tell us nothing, so we keep only the rows where call_ending_reason is present.
filtered_data = data[data['call_ending_reason'].notnull()]
filtered_data.head(11)
| baseid | fornavn | efternavn | adresse | stednavn | postnummer | bynavn | kommunekode | vej | husnr | ... | ssh_enlig_m_born | ssh_enlig_u_born | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 51934887 | fd0a8f24-2457-48b4-a7fe-fe0b95a3a81e | ba3df7f7-4943-4c68-b0bc-f606069e6d05 | Teglholt 12 | NaN | 6200 | Aabenraa | 580 | Teglholt | 12 | ... | 0 | 0,43 | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail |
| 22 | 11515649 | 15a1f0f6-b6e2-464a-8296-960659185184 | b3a586b3-4495-4ca5-a7ed-464508df504d | Sundvænget 32 | Dybbøl | 6400 | Sønderborg | 540 | Sundvænget | 32 | ... | 0 | 0,141414141414141 | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail |
| 83 | 12310066 | 4a679ecd-e896-4aad-b369-45f0164dc9e7 | ca8cf5f7-6eff-4f98-840a-60baaa3679ab | Spøttrupvej 42, st tv | Tjørring | 7400 | Herning | 657 | Spøttrupvej | 42 | ... | 0 | 0,25 | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse |
| 645 | 30029973 | 36db5eaa-2531-453e-9bb8-3a5b47a9a029 | d196005f-0729-4222-9761-531ecc10c72d | Hjortøvænget 12 | Skærbæk | 7000 | Fredericia | 607 | Hjortøvænget | 12 | ... | 0,333333333333333 | 0 | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse |
| 646 | 54059038 | e9c577d1-417e-4584-ad74-232b3f7dd06c | df2cd3d3-f172-4a18-b64a-6dcc070b9aa4 | Engvænget 32 | NaN | 2650 | Hvidovre | 167 | Engvænget | 32 | ... | 0,09 | 0 | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail |
| 649 | 61722285 | 1a2c9fde-ac13-46ee-8723-f27534ef14dd | af3b5f8c-5ce9-41af-a09b-8b463596dec1 | Nannasvej 10 | NaN | 6100 | Haderslev | 510 | Nannasvej | 10 | ... | 0 | 0 | 0,2 | 0,8 | 25.0 | 401.0 | Ejer | Parcelhus | 6.0 | Røde Kors - Sikkerhedsmail |
| 661 | 63420003 | c10cc775-7071-4ce1-882f-00bb7add40b6 | 8b90aa84-a58e-4af9-8c25-c9d1324b372b | Kløvenhøj 24 | Ølby | 7600 | Struer | 671 | Kløvenhøj | 24 | ... | 0 | 0,2 | 0,4 | 0,4 | 0.0 | 0.0 | Ejer | Parcelhus | 5.0 | Børns vilkår - Sikkerhedsmail |
| 662 | 56894951 | 2d942692-9654-461f-9cd1-4c2f5aa70722 | 0697dcfd-103d-454f-b8c4-f3e22abc6f68 | Abelonelundvej 4 | Strib | 5500 | Middelfart | 410 | Abelonelundvej | 4 | ... | 0,11 | 0,11 | 0 | 0,78 | 0.0 | 0.0 | Ejer | Parcelhus | 6.0 | Børns vilkår - Sikkerhedsmail |
| 667 | 57749394 | c1980451-c102-49ee-998d-f8310b130836 | ca8cf5f7-6eff-4f98-840a-60baaa3679ab | Borkvej 7 | No | 6950 | Ringkøbing | 760 | Borkvej | 7 | ... | 0 | 0,4 | 0 | 0,4 | 0.0 | 0.0 | Ejer | Stuehus til landbrug | 4.0 | Røde Kors - Sikkerhedsmail |
| 668 | 63082476 | 28fd697d-28e8-4f90-b189-85f7ab464dac | e6afb89e-e441-41da-8276-ac49b7d88206 | Slimmingevej 27 | NaN | 4100 | Ringsted | 259 | Slimmingevej | 27 | ... | 0 | 0 | 0,2 | 0,6 | 0.0 | 0.0 | Ejer | Stuehus til landbrug | 8.0 | Røde Kors - Sikkerhedsmail |
| 669 | 63217826 | 015d905c-07a5-45c1-bb3a-6dbfa91b607f | 64701ffb-883b-4ac0-85f4-e83ee973c82e | Svalevej 5 | Horbelev | 4871 | Horbelev | 376 | Svalevej | 5 | ... | 0,0808080808080808 | 0,585858585858586 | 0 | 0,333333333333333 | NaN | NaN | Selskab | Række/kæde/dobbelthus | 4.0 | Børns vilkår - Sikkerhedsmail |
11 rows × 67 columns
# Drop identifiers, address parts and phone numbers: they are unique (or nearly
# unique) per lead and cannot generalise as features.
filtered_data = filtered_data.drop(columns=[
    'baseid', 'fornavn', 'efternavn', 'adresse', 'vej', 'husnr', 'bogstav',
    'sal', 'side', 'kvhx', 'tlfnr1', 'tlfnr2', 'tlfnr3', 'tlfnr4', 'tlfnr5',
])
filtered_data.head()
| stednavn | postnummer | bynavn | kommunekode | robinson | tlftype1 | tlftype2 | tlftype3 | tlftype4 | tlftype5 | ... | ssh_enlig_m_born | ssh_enlig_u_born | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | NaN | 6200 | Aabenraa | 580 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,43 | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail |
| 22 | Dybbøl | 6400 | Sønderborg | 540 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,141414141414141 | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail |
| 83 | Tjørring | 7400 | Herning | 657 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,25 | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse |
| 645 | Skærbæk | 7000 | Fredericia | 607 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,333333333333333 | 0 | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse |
| 646 | NaN | 2650 | Hvidovre | 167 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,09 | 0 | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail |
5 rows × 52 columns
import pgeocode

# We are targeting only Danish postal codes
nomi = pgeocode.Nominatim('dk')

def convert_post_to_lat(post_number):
    data_frame = nomi.query_postal_code(str(post_number))
    return data_frame['latitude']

def convert_post_to_long(post_number):
    data_frame = nomi.query_postal_code(str(post_number))
    return data_frame['longitude']

# The per-row helpers above are kept for reference; a single batch query is much faster.
post_numbers = filtered_data['postnummer'].tolist()
post_numbers = list(map(str, post_numbers))  # materialise: query_postal_code expects a str or a list
query = nomi.query_postal_code(post_numbers)
filtered_data['latitude'] = query['latitude']
filtered_data['longitude'] = query['longitude']
C:\Users\Bruger\AppData\Local\Temp\ipykernel_14744\3412094935.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['latitude'] = query['latitude']
C:\Users\Bruger\AppData\Local\Temp\ipykernel_14744\3412094935.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['longitude'] = query['longitude']
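The warnings appear because filtered_data is still a slice of data. A sketch that avoids both the SettingWithCopyWarning and a subtler pitfall: the batch query comes back with a fresh 0..n-1 index, so assigning its columns directly lets pandas align them against filtered_data's original row labels rather than by position.

filtered_data = filtered_data.copy()  # own the data; silences SettingWithCopyWarning

query = nomi.query_postal_code(filtered_data['postnummer'].astype(str).tolist())

# .to_numpy() strips the query's RangeIndex so values land by position,
# not by (mismatched) index alignment.
filtered_data['latitude'] = query['latitude'].to_numpy()
filtered_data['longitude'] = query['longitude'].to_numpy()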
# Cache the cleaned frame; to_pickle handles serialisation itself and infers
# zip compression from the file extension.
filtered_data.to_pickle('filtered.zip')
filtered_data.head()
| stednavn | postnummer | bynavn | kommunekode | robinson | tlftype1 | tlftype2 | tlftype3 | tlftype4 | tlftype5 | ... | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | NaN | 6200 | Aabenraa | 580 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.4442 | 11.8065 |
| 22 | Dybbøl | 6400 | Sønderborg | 540 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail | 55.6602 | 11.4104 |
| 83 | Tjørring | 7400 | Herning | 657 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse | 56.1772 | 9.7805 |
| 645 | Skærbæk | 7000 | Fredericia | 607 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse | 56.0337 | 12.5881 |
| 646 | NaN | 2650 | Hvidovre | 167 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.9281 | 11.6393 |
5 rows × 54 columns
With coordinates in place, the textual location columns stednavn, postnummer and bynavn (together with kommunekode) are redundant and can be dropped.
filtered_data = filtered_data.drop(columns=['stednavn', 'postnummer', 'bynavn', 'kommunekode'])
filtered_data.head()
| robinson | tlftype1 | tlftype2 | tlftype3 | tlftype4 | tlftype5 | koen | alder | mosaic_gruppe | mosaic_type | ... | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 76.0 | J | J35 | ... | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.4442 | 11.8065 |
| 22 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | M | 61.0 | B | B03 | ... | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail | 55.6602 | 11.4104 |
| 83 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 74.0 | L | L44 | ... | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse | 56.1772 | 9.7805 |
| 645 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 39.0 | F | F20 | ... | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse | 56.0337 | 12.5881 |
| 646 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | M | 66.0 | B | B05 | ... | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.9281 | 11.6393 |
5 rows × 50 columns
filtered_data = filtered_data.reset_index(drop=True)
filtered_data.head()
| robinson | tlftype1 | tlftype2 | tlftype3 | tlftype4 | tlftype5 | koen | alder | mosaic_gruppe | mosaic_type | ... | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 76.0 | J | J35 | ... | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.4442 | 11.8065 |
| 1 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | M | 61.0 | B | B03 | ... | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail | 55.6602 | 11.4104 |
| 2 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 74.0 | L | L44 | ... | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse | 56.1772 | 9.7805 |
| 3 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 39.0 | F | F20 | ... | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse | 56.0337 | 12.5881 |
| 4 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | M | 66.0 | B | B05 | ... | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.9281 | 11.6393 |
5 rows × 50 columns
for column in filtered_data.columns:
print('Total number of unique entities in ' +str(column)+ ' Column is == ' + str(len(filtered_data[column].unique())))
Total number of unique entities in robinson Column is == 4
Total number of unique entities in tlftype1 Column is == 3
Total number of unique entities in tlftype2 Column is == 3
Total number of unique entities in tlftype3 Column is == 3
Total number of unique entities in tlftype4 Column is == 3
Total number of unique entities in tlftype5 Column is == 3
Total number of unique entities in koen Column is == 3
Total number of unique entities in alder Column is == 81
Total number of unique entities in mosaic_gruppe Column is == 13
Total number of unique entities in mosaic_type Column is == 45
Total number of unique entities in ssh_0_born Column is == 348
Total number of unique entities in ssh_1_barn Column is == 231
Total number of unique entities in ssh_2_born Column is == 252
Total number of unique entities in ssh_3_plus_born Column is == 408
Total number of unique entities in udd_grundskole Column is == 2336
Total number of unique entities in udd_almen_gymnasial Column is == 2284
Total number of unique entities in udd_erhvervsgymasial Column is == 2
Total number of unique entities in udd_erhvervsfaglig_forloeb Column is == 2856
Total number of unique entities in udd_kort_videregaaende Column is == 1365
Total number of unique entities in udd_mellemlang_videregaaende Column is == 2533
Total number of unique entities in udd_bachelor Column is == 1536
Total number of unique entities in udd_lang_videregaaende Column is == 1231
Total number of unique entities in udd_forsker Column is == 1231
Total number of unique entities in udd_uoplyst Column is == 1141
Total number of unique entities in socio_high_selvst Column is == 782
Total number of unique entities in socio_mellemniveau Column is == 252
Total number of unique entities in socio_grundniveau Column is == 617
Total number of unique entities in socio_ledig_kontant Column is == 338
Total number of unique entities in socio_pensionist Column is == 890
Total number of unique entities in socio_other Column is == 443
Total number of unique entities in civilstand_ugift Column is == 323
Total number of unique entities in civilstand_gift Column is == 309
Total number of unique entities in civilstand_skilt Column is == 249
Total number of unique entities in civilstand_enke Column is == 285
Total number of unique entities in okonomisk_formaaen Column is == 6
Total number of unique entities in antal_beboere Column is == 84
Total number of unique entities in husstandsindkomst Column is == 285645
Total number of unique entities in ssh_anden_hustype Column is == 280
Total number of unique entities in ssh_enlig_m_born Column is == 245
Total number of unique entities in ssh_enlig_u_born Column is == 389
Total number of unique entities in ssh_par_m_born Column is == 333
Total number of unique entities in ssh_par_u_born Column is == 333
Total number of unique entities in donation_ssh Column is == 91
Total number of unique entities in donation_gns Column is == 7769
Total number of unique entities in ejerforhold Column is == 7
Total number of unique entities in enhedsanvendelse Column is == 7
Total number of unique entities in antal vaerelser Column is == 104
Total number of unique entities in call_ending_reason Column is == 68
Total number of unique entities in latitude Column is == 785
Total number of unique entities in longitude Column is == 913
print(filtered_data['udd_erhvervsgymasial'].unique())
[ 0. nan]
# Removing this column: it carries no information for us, as it only contains 0 and NaN
del filtered_data['udd_erhvervsgymasial']
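More generally, any column with at most one distinct non-null value carries no signal. A hedged convenience sketch, not something the notebook does elsewhere (it would also catch udd_erhvervsgymasial):

# Drop every constant (or all-NaN) column in one sweep.
constant_cols = [col for col in filtered_data.columns
                 if filtered_data[col].nunique(dropna=True) <= 1]
filtered_data = filtered_data.drop(columns=constant_cols)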
One-hot encoding is applied only to the categorical features; the continuous ones (i.e. the probabilities) are never one-hot encoded.
columns_list = ['tlftype1', 'tlftype2', 'tlftype3', 'tlftype4', 'tlftype5', 'koen', 'mosaic_gruppe', 'mosaic_type', 'ejerforhold', 'enhedsanvendelse', 'okonomisk_formaaen']
for column in columns_list:
    one_hot = pd.get_dummies(filtered_data[column], prefix=column, prefix_sep='_')
    # Drop the original column as it is now encoded
    del filtered_data[column]
    # Join the encoded columns back on
    filtered_data = filtered_data.join(one_hot)
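The loop is equivalent to a single call over the whole frame; get_dummies expands only the listed columns and leaves the rest untouched:

filtered_data = pd.get_dummies(filtered_data, columns=columns_list, prefix_sep='_')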
filtered_data.head()
| robinson | alder | ssh_0_born | ssh_1_barn | ssh_2_born | ssh_3_plus_born | udd_grundskole | udd_almen_gymnasial | udd_erhvervsfaglig_forloeb | udd_kort_videregaaende | ... | enhedsanvendelse_Lejlighed | enhedsanvendelse_Parcelhus | enhedsanvendelse_Række/kæde/dobbelthus | enhedsanvendelse_Sommerhus | enhedsanvendelse_Stuehus til landbrug | okonomisk_formaaen_1. HVID | okonomisk_formaaen_2. GRØN | okonomisk_formaaen_3. GRÅ | okonomisk_formaaen_4. GUL | okonomisk_formaaen_5. RØD | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 76.0 | 1 | 0 | 0 | 0 | 0 | 0,0909090909090909 | 0,272727272727273 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 1 | 61.0 | 0,75 | 0,25 | 0 | 0 | 0,12280701754386 | 0 | 0,254385964912281 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 0 | 74.0 | 1 | 0 | 0 | 0 | 0,2 | 0 | 0,6 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 39.0 | 0,4 | 0,4 | 0,2 | 0 | 0 | 0 | 0 | 0,166666666666667 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 1 | 66.0 | 0,73 | 0 | 0,18 | 0,09 | 0 | 0 | 0,28448275862069 | 0,0689655172413793 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 123 columns
cols = ['alder','ssh_0_born','ssh_1_barn','ssh_2_born','ssh_3_plus_born','udd_grundskole','udd_almen_gymnasial','udd_erhvervsfaglig_forloeb','udd_kort_videregaaende','udd_mellemlang_videregaaende','udd_bachelor','udd_lang_videregaaende','udd_forsker','udd_uoplyst','socio_high_selvst','socio_mellemniveau','socio_grundniveau','socio_ledig_kontant','socio_pensionist','socio_other','civilstand_ugift','civilstand_gift','civilstand_skilt','civilstand_enke','antal_beboere','husstandsindkomst','ssh_anden_hustype','ssh_enlig_m_born','ssh_enlig_u_born','ssh_par_m_born','ssh_par_u_born','donation_ssh','donation_gns','antal vaerelser']
for col in cols:
    try:
        # The CSV uses the Danish decimal comma; convert to dots and cast to float
        filtered_data[col] = filtered_data[col].str.replace(',', '.').astype(float)
    except AttributeError:
        # Column is already numeric (no .str accessor); print which ones
        print(col)
alder
antal_beboere
husstandsindkomst
donation_ssh
donation_gns
antal vaerelser
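An alternative that avoids the conversion loop entirely is to declare the decimal separator at load time; a sketch, assuming every comma-decimal column in the file should be parsed as a float:

data = pd.read_csv('teledata_done.csv', sep=';', encoding='iso-8859-1',
                   decimal=',', low_memory=False)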
filtered_data.head()
| robinson | alder | ssh_0_born | ssh_1_barn | ssh_2_born | ssh_3_plus_born | udd_grundskole | udd_almen_gymnasial | udd_erhvervsfaglig_forloeb | udd_kort_videregaaende | ... | enhedsanvendelse_Lejlighed | enhedsanvendelse_Parcelhus | enhedsanvendelse_Række/kæde/dobbelthus | enhedsanvendelse_Sommerhus | enhedsanvendelse_Stuehus til landbrug | okonomisk_formaaen_1. HVID | okonomisk_formaaen_2. GRØN | okonomisk_formaaen_3. GRÅ | okonomisk_formaaen_4. GUL | okonomisk_formaaen_5. RØD | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 76.0 | 1 | 0 | 0 | 0 | 0 | 0.0909090909090909 | 0.272727272727273 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 1 | 61.0 | 0.75 | 0.25 | 0 | 0 | 0.12280701754386 | 0 | 0.254385964912281 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 0 | 74.0 | 1 | 0 | 0 | 0 | 0.2 | 0 | 0.6 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 39.0 | 0.4 | 0.4 | 0.2 | 0 | 0 | 0 | 0 | 0.166666666666667 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 1 | 66.0 | 0.73 | 0 | 0.18 | 0.09 | 0 | 0 | 0.28448275862069 | 0.0689655172413793 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 123 columns
Exploding gradients are a problem where large error gradients accumulate and result in very large updates to the neural network's weights during training. Feeding the network a raw income feature in the hundreds of thousands of DKK invites exactly that, so household income is min-max normalised to [0, 1].
filtered_data['husstandsindkomst'] = (filtered_data['husstandsindkomst'] - filtered_data['husstandsindkomst'].min()) / (filtered_data['husstandsindkomst'].max() - filtered_data['husstandsindkomst'].min())
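The same min-max rescaling via scikit-learn (a sketch; note that fitting on the full column before the train/test split leaks the min and max into the test set, so strictly one would fit on the training rows only):

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
filtered_data[['husstandsindkomst']] = scaler.fit_transform(filtered_data[['husstandsindkomst']])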
import pandas as pd
mapping_file = pd.read_csv('map.csv',encoding='iso-8859-1')
mapping_file.head()
| call_ending_reason | meaning | Yes/no/unfinished | |
|---|---|---|---|
| 0 | Ingen Interesse | No I will not donate | No |
| 1 | Vil Ikke Kontaktes | Do not want to be contacted again | No |
| 2 | FORKERT NUMMER | Wrong number/said person do not exist on this ... | No |
| 3 | Ugyldigt nummer (anvendes når nummer er forkert) | Wrong number/said person do not exist on this ... | No |
| 4 | Vil ikke udlevere BS-oplysninger | Do not want to give out direct debit informations | No |
def convert_to_int(x):
if x == 'No':
return 0
elif x == 'Yes':
return 1
elif x =='Unfinished':
return 2
mapping_file['Yes/no/unfinished'] = mapping_file['Yes/no/unfinished'].apply(convert_to_int)
mapping_file.head()
| call_ending_reason | meaning | Yes/no/unfinished | |
|---|---|---|---|
| 0 | Ingen Interesse | No I will not donate | 0 |
| 1 | Vil Ikke Kontaktes | Do not want to be contacted again | 0 |
| 2 | FORKERT NUMMER | Wrong number/said person do not exist on this ... | 0 |
| 3 | Ugyldigt nummer (anvendes når nummer er forkert) | Wrong number/said person do not exist on this ... | 0 |
| 4 | Vil ikke udlevere BS-oplysninger | Do not want to give out direct debit informations | 0 |
dictionary = {}
for string,integer in zip(mapping_file['call_ending_reason'],mapping_file['Yes/no/unfinished']):
dictionary[string] = integer
filtered_data['call_ending_reason'] = filtered_data['call_ending_reason'].map(dictionary)
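The loop-built dictionary above is equivalent to a one-liner (same mapping, just more idiomatic):

dictionary = dict(zip(mapping_file['call_ending_reason'], mapping_file['Yes/no/unfinished']))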
x = filtered_data  # NB: an alias, not a copy - the del below also removes the column from filtered_data
y = filtered_data['call_ending_reason']
del x['call_ending_reason']
x = x.fillna(-1)
x.isnull().values.any()
y = y.fillna(0) #Empty reason should be zero
y.isnull().values.any()
False
# x.to_csv('x.csv', encoding='iso-8859-1')
# y.to_csv('y.csv', encoding='iso-8859-1')
# import numpy as np
# def clean_dataset(df):
# assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
# df.dropna(inplace=True)
# indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
# return df[indices_to_keep].astype(np.float64)
In my experience, fancy t-SNE visualisations for data analysis can be deceiving, especially on real-world problems. KNN, by contrast, works by directly exploiting the relationship between neighbours: if a KNN performs reasonably well, the features span a high-dimensional space in which the classes can actually be distinguished.
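A quick sanity check in that spirit (a hedged sketch, not run in this notebook; on a million-plus rows a plain KNN is slow): if it beats the majority-class baseline, the feature space carries usable neighbourhood structure.

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X_tr, X_te, y_tr, y_te = train_test_split(x, y, train_size=0.75, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_tr, y_tr)
print(knn.score(X_te, y_te))  # compare against always predicting the majority class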
# from sklearn.ensemble import RandomForestClassifier
# rfc = RandomForestClassifier()
# rfc.fit(x, y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.75)
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(verbose=True, hidden_layer_sizes=(100, 50, 10), learning_rate='invscaling',
                    learning_rate_init=0.0001, n_iter_no_change=50, max_iter=1000)
mlp.fit(X_train, y_train)
Iteration 1, loss = 0.64777083
Iteration 2, loss = 0.61998823
Iteration 3, loss = 0.60810727
Iteration 4, loss = 0.61431193
Iteration 5, loss = 0.61331521
Iteration 6, loss = 0.60753339
Iteration 7, loss = 0.60393861
Iteration 8, loss = 0.60279712
Iteration 9, loss = 0.60324869
Iteration 10, loss = 0.60323766
Iteration 11, loss = 0.59907949
Iteration 12, loss = 0.59671948
Iteration 13, loss = 0.59781176
Iteration 14, loss = 0.59457808
Iteration 15, loss = 0.59430114
Iteration 16, loss = 0.59218180
Iteration 17, loss = 0.58931528
Iteration 18, loss = 0.59039334
Iteration 19, loss = 0.58869816
Iteration 20, loss = 0.58951301
Iteration 21, loss = 0.58711687
Iteration 22, loss = 0.58615668
Iteration 23, loss = 0.58588281
Iteration 24, loss = 0.58521278
Iteration 25, loss = 0.58494883
Iteration 26, loss = 0.58354615
Iteration 27, loss = 0.58244103
Iteration 28, loss = 0.58245776
Iteration 29, loss = 0.58251832
Iteration 30, loss = 0.58112408
Iteration 31, loss = 0.58225162
Iteration 32, loss = 0.58080646
Iteration 33, loss = 0.57989833
Iteration 34, loss = 0.58019845
Iteration 35, loss = 0.57842373
Iteration 36, loss = 0.57854850
Iteration 37, loss = 0.57762902
Iteration 38, loss = 0.57767142
Iteration 39, loss = 0.57775166
Iteration 40, loss = 0.57662654
Iteration 41, loss = 0.57673124
Iteration 42, loss = 0.57646452
Iteration 43, loss = 0.57658903
Iteration 44, loss = 0.57624564
Iteration 45, loss = 0.57610325
Iteration 46, loss = 0.57599913
Iteration 47, loss = 0.57561941
Iteration 48, loss = 0.57553930
Iteration 49, loss = 0.57554522
Iteration 50, loss = 0.57534226
Iteration 51, loss = 0.57525283
Iteration 52, loss = 0.57519467
Iteration 53, loss = 0.57510471
Iteration 54, loss = 0.57510156
Iteration 55, loss = 0.57499211
Iteration 56, loss = 0.57496538
Iteration 57, loss = 0.57493708
Iteration 58, loss = 0.57495190
Iteration 59, loss = 0.57488816
Iteration 60, loss = 0.57488638
Iteration 61, loss = 0.57479721
Iteration 62, loss = 0.57473223
Iteration 63, loss = 0.57475108
Iteration 64, loss = 0.57467829
Iteration 65, loss = 0.57470547
Iteration 66, loss = 0.57458585
Iteration 67, loss = 0.57466983
Iteration 68, loss = 0.57460967
Iteration 69, loss = 0.57462100
Iteration 70, loss = 0.57451775
Iteration 71, loss = 0.57443434
Iteration 72, loss = 0.57446263
Iteration 73, loss = 0.57445124
Iteration 74, loss = 0.57439550
Iteration 75, loss = 0.57438002
Iteration 76, loss = 0.57430134
Iteration 77, loss = 0.57432144
Iteration 78, loss = 0.57429302
Iteration 79, loss = 0.57431137
Iteration 80, loss = 0.57428483
Iteration 81, loss = 0.57421079
Iteration 82, loss = 0.57417083
Iteration 83, loss = 0.57413732
Iteration 84, loss = 0.57408192
Iteration 85, loss = 0.57407828
Iteration 86, loss = 0.57405153
Iteration 87, loss = 0.57403162
Iteration 88, loss = 0.57399044
Iteration 89, loss = 0.57395314
Iteration 90, loss = 0.57394180
Iteration 91, loss = 0.57392033
Iteration 92, loss = 0.57387986
Iteration 93, loss = 0.57384104
Iteration 94, loss = 0.57378687
Iteration 95, loss = 0.57375078
Iteration 96, loss = 0.57373843
Iteration 97, loss = 0.57370981
Iteration 98, loss = 0.57367616
Iteration 99, loss = 0.57365536
Iteration 100, loss = 0.57362819
Iteration 101, loss = 0.57357638
Iteration 102, loss = 0.57357717
Iteration 103, loss = 0.57352019
Iteration 104, loss = 0.57352027
Iteration 105, loss = 0.57347332
Iteration 106, loss = 0.57340626
Training loss did not improve more than tol=0.000100 for 50 consecutive epochs. Stopping.
MLPClassifier(hidden_layer_sizes=(100, 50, 10), learning_rate='invscaling',
learning_rate_init=0.0001, max_iter=1000, n_iter_no_change=50,
verbose=True)
from sklearn.metrics import classification_report
# NB: the majority class covers 251386/315625 ≈ 0.80 of y_test, so the 0.80
# accuracy below is no better than always predicting class 2.
print(classification_report(y_test, mlp.predict(X_test)))
# print(classification_report(y[5000:20000], mlp.predict(x[5000:20000])))
precision recall f1-score support
0.0 0.12 0.00 0.00 57358
1.0 0.00 0.00 0.00 6881
2.0 0.80 1.00 0.89 251386
accuracy 0.80 315625
macro avg 0.31 0.33 0.30 315625
weighted avg 0.66 0.80 0.71 315625
x.head()
| robinson | alder | ssh_0_born | ssh_1_barn | ssh_2_born | ssh_3_plus_born | udd_grundskole | udd_almen_gymnasial | udd_erhvervsfaglig_forloeb | udd_kort_videregaaende | ... | enhedsanvendelse_Lejlighed | enhedsanvendelse_Parcelhus | enhedsanvendelse_Række/kæde/dobbelthus | enhedsanvendelse_Sommerhus | enhedsanvendelse_Stuehus til landbrug | okonomisk_formaaen_1. HVID | okonomisk_formaaen_2. GRØN | okonomisk_formaaen_3. GRÅ | okonomisk_formaaen_4. GUL | okonomisk_formaaen_5. RØD | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 76.0 | 1 | 0 | 0 | 0 | 0 | 0.0909090909090909 | 0.272727272727273 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 1 | 61.0 | 0.75 | 0.25 | 0 | 0 | 0.12280701754386 | 0 | 0.254385964912281 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 0 | 74.0 | 1 | 0 | 0 | 0 | 0.2 | 0 | 0.6 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 39.0 | 0.4 | 0.4 | 0.2 | 0 | 0 | 0 | 0 | 0.166666666666667 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 1 | 66.0 | 0.73 | 0 | 0.18 | 0.09 | 0 | 0 | 0.28448275862069 | 0.0689655172413793 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 122 columns
y.value_counts()
2.0    1005293
0.0     229547
1.0      27659
Name: call_ending_reason, dtype: int64
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
label = ['No', 'Yes', 'Unfinished']
# Read the class counts off y itself rather than hard-coding them
number = [y.value_counts()[0.0], y.value_counts()[1.0], y.value_counts()[2.0]]
ax.bar(label, number)
plt.show()
An imbalanced classification problem is a classification problem where the distribution of examples across the known classes is biased or skewed. ... Many real-world classification problems have an imbalanced class distribution, such as fraud detection, spam detection, and churn prediction.
We are going to try to downsample the majority classes.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=1)
df_balanced, balanced_labels = rus.fit_resample(x, y)
balanced_labels.value_counts()
0.0    27659
1.0    27659
2.0    27659
Name: call_ending_reason, dtype: int64
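Downsampling discards most of the "Unfinished" calls. An alternative that keeps every row is to reweight the classes instead; a sketch using scikit-learn's built-in option (rfc_weighted is an illustrative name):

from sklearn.ensemble import RandomForestClassifier

# class_weight='balanced' scales each class inversely to its frequency,
# so the ~1M 'Unfinished' calls no longer dominate the fit.
rfc_weighted = RandomForestClassifier(class_weight='balanced', random_state=0)
rfc_weighted.fit(x, y)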
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
label = ['No', 'Yes', 'Unfinished']
number = [27659,27659,27659]
ax.bar(label,number)
plt.show()
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(df_balanced, balanced_labels)
from sklearn.metrics import classification_report
# NB: this evaluates on the very data the forest was trained on; the perfect
# scores below reflect memorisation, not generalisation.
print(classification_report(balanced_labels, rfc.predict(df_balanced)))
precision recall f1-score support
0.0 1.00 1.00 1.00 27659
1.0 1.00 1.00 1.00 27659
2.0 1.00 1.00 1.00 27659
accuracy 1.00 82977
macro avg 1.00 1.00 1.00 82977
weighted avg 1.00 1.00 1.00 82977
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced, balanced_labels, train_size = 0.99)
# from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(verbose=True, hidden_layer_sizes=(100, 50, 10), learning_rate='constant',
                    learning_rate_init=0.0005, n_iter_no_change=50, max_iter=1000)
# mlp = RandomForestClassifier()
mlp.fit(X_train, y_train)

from sklearn.metrics import classification_report
print(classification_report(y_test, mlp.predict(X_test)))
Iteration 1, loss = 1.32958292
Iteration 2, loss = 1.17382379
Iteration 3, loss = 1.17798205
Iteration 4, loss = 1.16973133
Iteration 5, loss = 1.16275242
Iteration 6, loss = 1.14890057
Iteration 7, loss = 1.14993119
Iteration 8, loss = 1.13621909
Iteration 9, loss = 1.15393737
Iteration 10, loss = 1.12756345
Iteration 11, loss = 1.13597190
Iteration 12, loss = 1.10771288
Iteration 13, loss = 1.11449666
Iteration 14, loss = 1.10836334
Iteration 15, loss = 1.10619507
Iteration 16, loss = 1.10711457
Iteration 17, loss = 1.11136752
Iteration 18, loss = 1.10664509
Iteration 19, loss = 1.10600425
Iteration 20, loss = 1.10593425
Iteration 21, loss = 1.10258331
Iteration 22, loss = 1.10134637
Iteration 23, loss = 1.10732109
Iteration 24, loss = 1.09940176
Iteration 25, loss = 1.10047125
Iteration 26, loss = 1.10109537
Iteration 27, loss = 1.09928363
Iteration 28, loss = 1.10029805
Iteration 29, loss = 1.09909523
Iteration 30, loss = 1.10152546
Iteration 31, loss = 1.09757330
Iteration 32, loss = 1.09762873
Iteration 33, loss = 1.09648323
Iteration 34, loss = 1.09662355
Iteration 35, loss = 1.09694548
Iteration 36, loss = 1.09673984
Iteration 37, loss = 1.09683113
Iteration 38, loss = 1.09630211
Iteration 39, loss = 1.09595167
Iteration 40, loss = 1.09571349
Iteration 41, loss = 1.09546417
Iteration 42, loss = 1.09587294
Iteration 43, loss = 1.09555470
Iteration 44, loss = 1.09488909
Iteration 45, loss = 1.09482741
Iteration 46, loss = 1.09468202
Iteration 47, loss = 1.09473918
Iteration 48, loss = 1.09442563
Iteration 49, loss = 1.09419259
Iteration 50, loss = 1.09404434
Iteration 51, loss = 1.09387755
Iteration 52, loss = 1.09365237
Iteration 53, loss = 1.09344749
Iteration 54, loss = 1.09323214
Iteration 55, loss = 1.09300811
Iteration 56, loss = 1.09275027
Iteration 57, loss = 1.09262124
Iteration 58, loss = 1.09228070
Iteration 59, loss = 1.09187742
Iteration 60, loss = 1.09154762
Iteration 61, loss = 1.09135797
Iteration 62, loss = 1.09110705
Iteration 63, loss = 1.09098558
Iteration 64, loss = 1.09036456
Iteration 65, loss = 1.09035000
Iteration 66, loss = 1.08988133
Iteration 67, loss = 1.08957648
Iteration 68, loss = 1.08954219
Iteration 69, loss = 1.08857836
Iteration 70, loss = 1.08859061
Iteration 71, loss = 1.08829504
Iteration 72, loss = 1.08776913
Iteration 73, loss = 1.08747694
Iteration 74, loss = 1.08697770
Iteration 75, loss = 1.08680607
Iteration 76, loss = 1.08634044
Iteration 77, loss = 1.08565296
Iteration 78, loss = 1.08599048
Iteration 79, loss = 1.08563575
Iteration 80, loss = 1.08547957
Iteration 81, loss = 1.08480080
Iteration 82, loss = 1.08390291
Iteration 83, loss = 1.08351013
Iteration 84, loss = 1.08357633
Iteration 85, loss = 1.08286934
Iteration 86, loss = 1.08218604
Iteration 87, loss = 1.08181367
Iteration 88, loss = 1.08200830
Iteration 89, loss = 1.08133112
Iteration 90, loss = 1.08116489
Iteration 91, loss = 1.08061552
Iteration 92, loss = 1.07998952
Iteration 93, loss = 1.08079146
Iteration 94, loss = 1.07996682
Iteration 95, loss = 1.07870790
Iteration 96, loss = 1.07778636
Iteration 97, loss = 1.07790600
Iteration 98, loss = 1.07721476
Iteration 99, loss = 1.07660069
Iteration 100, loss = 1.07624284
Iteration 101, loss = 1.07654407
Iteration 102, loss = 1.07573565
Iteration 103, loss = 1.07521014
Iteration 104, loss = 1.07492561
Iteration 105, loss = 1.07399228
Iteration 106, loss = 1.07352169
Iteration 107, loss = 1.07347454
Iteration 108, loss = 1.07250182
Iteration 109, loss = 1.07181892
Iteration 110, loss = 1.07254877
Iteration 111, loss = 1.07124297
Iteration 112, loss = 1.07075382
Iteration 113, loss = 1.06995389
Iteration 114, loss = 1.06978551
Iteration 115, loss = 1.06912700
Iteration 116, loss = 1.06859849
Iteration 117, loss = 1.06816041
Iteration 118, loss = 1.06829347
Iteration 119, loss = 1.06718457
Iteration 120, loss = 1.06697756
Iteration 121, loss = 1.06723005
Iteration 122, loss = 1.06637848
Iteration 123, loss = 1.06606195
Iteration 124, loss = 1.06514628
Iteration 125, loss = 1.06414802
Iteration 126, loss = 1.06431648
Iteration 127, loss = 1.06409033
Iteration 128, loss = 1.06325656
Iteration 129, loss = 1.06297425
Iteration 130, loss = 1.06224040
Iteration 131, loss = 1.06168101
Iteration 132, loss = 1.06123806
Iteration 133, loss = 1.06104693
Iteration 134, loss = 1.06027743
Iteration 135, loss = 1.05993886
Iteration 136, loss = 1.05973855
Iteration 137, loss = 1.05878706
Iteration 138, loss = 1.05867528
Iteration 139, loss = 1.05804531
Iteration 140, loss = 1.05744906
Iteration 141, loss = 1.05691735
Iteration 142, loss = 1.05664131
Iteration 143, loss = 1.05577439
Iteration 144, loss = 1.05562281
Iteration 145, loss = 1.05543808
Iteration 146, loss = 1.05416183
Iteration 147, loss = 1.05462832
Iteration 148, loss = 1.05387215
Iteration 149, loss = 1.05297521
Iteration 150, loss = 1.05314814
Iteration 151, loss = 1.05103858
Iteration 152, loss = 1.05247854
Iteration 153, loss = 1.05133800
Iteration 154, loss = 1.05075880
Iteration 155, loss = 1.05031957
Iteration 156, loss = 1.04992163
Iteration 157, loss = 1.04930741
Iteration 158, loss = 1.04923314
Iteration 159, loss = 1.04835843
Iteration 160, loss = 1.04803262
Iteration 161, loss = 1.04810848
Iteration 162, loss = 1.04806761
Iteration 163, loss = 1.04712539
Iteration 164, loss = 1.04625580
Iteration 165, loss = 1.04681005
Iteration 166, loss = 1.04690796
Iteration 167, loss = 1.04533942
Iteration 168, loss = 1.04540972
Iteration 169, loss = 1.04426914
Iteration 170, loss = 1.04399191
Iteration 171, loss = 1.04484498
Iteration 172, loss = 1.04585781
Iteration 173, loss = 1.04395345
Iteration 174, loss = 1.04301116
Iteration 175, loss = 1.04160483
Iteration 176, loss = 1.04093849
Iteration 177, loss = 1.04111133
Iteration 178, loss = 1.04023517
Iteration 179, loss = 1.04072626
Iteration 180, loss = 1.04076880
Iteration 181, loss = 1.03976513
Iteration 182, loss = 1.03919063
Iteration 183, loss = 1.03980694
Iteration 184, loss = 1.03805745
Iteration 185, loss = 1.03775399
Iteration 186, loss = 1.03793278
Iteration 187, loss = 1.03809177
Iteration 188, loss = 1.03801989
Iteration 189, loss = 1.03712718
Iteration 190, loss = 1.03699491
Iteration 191, loss = 1.03595260
Iteration 192, loss = 1.03542557
Iteration 193, loss = 1.03665553
Iteration 194, loss = 1.03482278
Iteration 195, loss = 1.03473248
Iteration 196, loss = 1.03378772
Iteration 197, loss = 1.03349631
Iteration 198, loss = 1.03292589
Iteration 199, loss = 1.03294175
Iteration 200, loss = 1.03211726
Iteration 201, loss = 1.03269302
Iteration 202, loss = 1.03203076
Iteration 203, loss = 1.03118882
Iteration 204, loss = 1.03072580
Iteration 205, loss = 1.03075197
Iteration 206, loss = 1.02863130
Iteration 207, loss = 1.03045681
Iteration 208, loss = 1.02955150
Iteration 209, loss = 1.03046046
Iteration 210, loss = 1.02797217
Iteration 211, loss = 1.02807452
Iteration 212, loss = 1.02711934
Iteration 213, loss = 1.02751267
Iteration 214, loss = 1.02680380
Iteration 215, loss = 1.02638458
Iteration 216, loss = 1.02764123
Iteration 217, loss = 1.02665648
Iteration 218, loss = 1.02576724
Iteration 219, loss = 1.02587263
Iteration 220, loss = 1.02570350
Iteration 221, loss = 1.02574609
Iteration 222, loss = 1.02480034
Iteration 223, loss = 1.02535963
Iteration 224, loss = 1.02427491
Iteration 225, loss = 1.02439672
Iteration 226, loss = 1.02370632
Iteration 227, loss = 1.02240893
Iteration 228, loss = 1.02387538
Iteration 229, loss = 1.02147330
Iteration 230, loss = 1.02283700
Iteration 231, loss = 1.02360277
Iteration 232, loss = 1.02165348
Iteration 233, loss = 1.02191429
Iteration 234, loss = 1.02150599
Iteration 235, loss = 1.02107779
Iteration 236, loss = 1.02285793
Iteration 237, loss = 1.02218805
Iteration 238, loss = 1.02048871
Iteration 239, loss = 1.02097715
Iteration 240, loss = 1.01912654
Iteration 241, loss = 1.01947821
Iteration 242, loss = 1.01924574
Iteration 243, loss = 1.01797720
Iteration 244, loss = 1.01846717
Iteration 245, loss = 1.01823458
Iteration 246, loss = 1.01827314
Iteration 247, loss = 1.01772186
Iteration 248, loss = 1.01644207
Iteration 249, loss = 1.01711936
Iteration 250, loss = 1.01771949
Iteration 251, loss = 1.01974530
Iteration 252, loss = 1.02018019
Iteration 253, loss = 1.01764126
Iteration 254, loss = 1.01676338
Iteration 255, loss = 1.01625573
Iteration 256, loss = 1.01493192
Iteration 257, loss = 1.01600074
Iteration 258, loss = 1.01596189
Iteration 259, loss = 1.01449007
Iteration 260, loss = 1.01702424
Iteration 261, loss = 1.01406420
Iteration 262, loss = 1.01498624
Iteration 263, loss = 1.01389573
Iteration 264, loss = 1.01282257
Iteration 265, loss = 1.01407127
Iteration 266, loss = 1.01384982
Iteration 267, loss = 1.01525060
Iteration 268, loss = 1.01376363
Iteration 269, loss = 1.01213140
Iteration 270, loss = 1.01234962
Iteration 271, loss = 1.01285216
Iteration 272, loss = 1.01207405
Iteration 273, loss = 1.01253674
Iteration 274, loss = 1.01224531
Iteration 275, loss = 1.01232972
Iteration 276, loss = 1.01150781
Iteration 277, loss = 1.01176198
Iteration 278, loss = 1.01316537
Iteration 279, loss = 1.01068766
Iteration 280, loss = 1.00964816
Iteration 281, loss = 1.00863978
Iteration 282, loss = 1.01013200
Iteration 283, loss = 1.00909582
Iteration 284, loss = 1.00900545
Iteration 285, loss = 1.01111183
Iteration 286, loss = 1.01004057
Iteration 287, loss = 1.00782741
Iteration 288, loss = 1.00842926
Iteration 289, loss = 1.00779102
Iteration 290, loss = 1.00770760
Iteration 291, loss = 1.00771807
Iteration 292, loss = 1.00811901
Iteration 293, loss = 1.00800285
Iteration 294, loss = 1.00943394
Iteration 295, loss = 1.00755538
Iteration 296, loss = 1.00699696
Iteration 297, loss = 1.00553626
Iteration 298, loss = 1.00488759
Iteration 299, loss = 1.00773875
Iteration 300, loss = 1.00879360
[... iterations 301-999 omitted: the loss drifts down slowly and noisily from about 1.01 to about 0.95 ...]
Iteration 1000, loss = 0.95560568
              precision    recall  f1-score   support

         0.0       0.40      0.35      0.37       279
         1.0       0.31      0.32      0.31       253
         2.0       0.38      0.41      0.39       298

    accuracy                           0.36       830
   macro avg       0.36      0.36      0.36       830
weighted avg       0.36      0.36      0.36       830
C:\Users\Bruger\AppData\Roaming\Python\Python310\site-packages\sklearn\neural_network\_multilayer_perceptron.py:692: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
  warnings.warn(
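The ConvergenceWarning means the optimiser exhausted all 1000 iterations while the loss was still drifting downward, which matches the slow, noisy descent in the log above. A common remedy, sketched below as a suggestion rather than something this notebook does, is to standardise the features before fitting; unscaled inputs are a frequent cause of slow MLP convergence:
# Minimal sketch, assuming the df_balanced/balanced_labels split from above;
# standardising the inputs usually lets the MLP converge in far fewer iterations.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
pipe = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000))
pipe.fit(X_train, y_train)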
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced, balanced_labels, train_size=0.99)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
# One hidden layer of 100 units; training stops once the loss fails to improve
# for 50 consecutive iterations (n_iter_no_change=50) or after max_iter=1000.
# Note: learning_rate='invscaling' only applies with solver='sgd'; the default
# solver is 'adam', so the schedule is ignored here.
mlp = MLPClassifier(verbose=True, hidden_layer_sizes=(100,), learning_rate='invscaling',
                    learning_rate_init=0.0005, n_iter_no_change=50, max_iter=1000)
# mlp = RandomForestClassifier()  # previously tried alternative, kept for reference
mlp.fit(X_train, y_train)
from sklearn.metrics import classification_report
print(classification_report(y_test, mlp.predict(X_test)))
Iteration 1, loss = 2.24735406
Iteration 2, loss = 1.21552970
[... iterations 3-117 omitted: the loss hovers between roughly 1.15 and 1.23 without clear improvement ...]
Iteration 118, loss = 1.16267827
Training loss did not improve more than tol=0.000100 for 50 consecutive epochs. Stopping.
              precision    recall  f1-score   support

         0.0       0.35      0.39      0.36       279
         1.0       0.33      0.45      0.38       257
         2.0       0.35      0.20      0.25       294

    accuracy                           0.34       830
   macro avg       0.34      0.34      0.33       830
weighted avg       0.34      0.34      0.33       830
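Both runs land at 0.34-0.36 accuracy on three roughly balanced classes, which is barely above chance. A quick sanity check, not in the original notebook, is to score a dummy baseline on the same split:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
# Stratified random guessing should score about 1/3 on three balanced classes.
dummy = DummyClassifier(strategy='stratified', random_state=0)
dummy.fit(X_train, y_train)
print(accuracy_score(y_test, dummy.predict(X_test)))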
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the balanced feature matrix onto its first three principal components.
pca_3d = PCA(n_components=3)
clustered_dataset_3d = pca_3d.fit_transform(df_balanced)

def plot3dwithspike(width, height, title, datapoints, myLabel=None):
    # 3D scatter of the projected points, coloured by class label.
    plt.figure(figsize=(width, height))
    plt.title(title, fontsize='medium')
    ax = plt.axes(projection='3d')
    ax.scatter3D(datapoints[:, 0], datapoints[:, 1], datapoints[:, 2],
                 c=myLabel, marker='o', s=15, edgecolor='k')
    plt.show()

plot3dwithspike(20, 20, "PCA3 donation data", clustered_dataset_3d, balanced_labels)
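Before reading much into the 3D scatter, it is worth checking how much variance the three components actually retain; a low ratio would mean the projection hides most of the structure. A one-line check (an addition, not in the original):
# Fraction of the total variance captured by each of the three components.
print(pca_3d.explained_variance_ratio_, pca_3d.explained_variance_ratio_.sum())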
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced, balanced_labels, train_size=0.99)
from sklearn.neighbors import LocalOutlierFactor
# novelty=True fits the LOF model on training data so predict() can score unseen
# samples: 1 marks an inlier, -1 an outlier.
clf = LocalOutlierFactor(novelty=True)
clf.fit(X_train)
clf.predict(X_test)
C:\Users\Bruger\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names
  warnings.warn(
array([ 1,  1,  1, ...,  1,  1,  1])
[830 test predictions in total: 826 inliers (1) and 4 points flagged as outliers (-1)]
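With only 4 of 830 rows flagged, LOF treats the held-out data as almost entirely inliers. If the aim is to re-score the classifier on inliers only, a sketch along these lines would work (mlp, X_test and y_test come from the cells above; the filtering step itself is an assumption, and note that the split was redrawn before the LOF cell, so mlp may have seen some of these rows during training):
from sklearn.metrics import classification_report
# Keep only the rows LOF labels as inliers (prediction == 1), then re-score.
is_inlier = clf.predict(X_test) == 1
print(f"kept {is_inlier.sum()} of {len(is_inlier)} test rows")
print(classification_report(y_test[is_inlier], mlp.predict(X_test[is_inlier])))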