import numpy as np
import pandas as pd

# Lead/prospect table, one row per lead. Danish CSV export: ';' separator,
# Latin-1 encoding. low_memory=False reads each column in one pass so pandas
# can infer a single dtype, silencing the DtypeWarning about mixed-type
# columns (16-19, 21-24) that the chunked reader otherwise emits.
data = pd.read_csv('teledata_done.csv', sep=';', encoding='iso-8859-1', low_memory=False)
data.head()
C:\Users\Bruger\AppData\Local\Temp\ipykernel_14744\4022911079.py:3: DtypeWarning: Columns (16,17,18,19,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False. data = pd.read_csv('teledata_done.csv',sep=';', encoding='iso-8859-1')
baseid | fornavn | efternavn | adresse | stednavn | postnummer | bynavn | kommunekode | vej | husnr | ... | ssh_anden_hustype | ssh_enlig_m_born | ssh_enlig_u_born | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 65536878 | ec107270-e365-41aa-82f7-ac7716e0a06f | 38c49621-2e54-4ae2-90ac-c54a47a657ec | Søbredden 24 | Svogerslev | 4000 | Roskilde | 265 | Søbredden | 24 | ... | 0,25 | 0 | 0 | 0,75 | 0 | 29.0 | 1196.0 | Ejer | Parcelhus | 5.0 |
1 | 63503287 | 5f229333-fe30-404d-a8f7-881d082fbd87 | 679d6544-5a01-4cb5-bc73-ccc936f50482 | Jacob Appels Alle 36 | NaN | 2770 | Kastrup | 185 | Jacob Appels Alle | 36 | ... | 0,121212121212121 | 0 | 0,121212121212121 | 0,757575757575758 | 0 | 35.0 | 569.0 | Ejer | Række/kæde/dobbelthus | 5.0 |
2 | 62199324 | 1a25f102-40f6-46c5-9ed7-f79259118aca | 0697dcfd-103d-454f-b8c4-f3e22abc6f68 | Højbovænge 8 | NaN | 4660 | Store Heddinge | 336 | Højbovænge | 8 | ... | 0 | 0,111111111111111 | 0,444444444444444 | 0,111111111111111 | 0,333333333333333 | 19.0 | 302.0 | Lejer | Række/kæde/dobbelthus | 3.0 |
3 | 64841017 | 348dc61c-209c-4b77-a3fe-9cde68ff65f6 | a18f669c-9bf6-498a-84bf-783a7b93b38d | Engdraget 2 | Gårslev | 7080 | Børkop | 630 | Engdraget | 2 | ... | 0,2 | 0 | 0 | 0,4 | 0,4 | 43.0 | 1747.0 | Ejer | Parcelhus | 7.0 |
4 | 10055668 | a64ff2ca-bb9d-46ab-8570-28bd39ebde27 | ca8cf5f7-6eff-4f98-840a-60baaa3679ab | Irisvej 9 | NaN | 3300 | Frederiksværk | 260 | Irisvej | 9 | ... | 0,333333333333333 | 0 | 0 | 0,333333333333333 | 0,333333333333333 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 |
5 rows × 66 columns
# Survey the cardinality of every column: near-unique columns are identifiers
# (useless as features), low-cardinality ones are categorical candidates.
for column in data.columns:
    print(f'Total number of unique entities in {column} Column is == {len(data[column].unique())}')
Total number of unique entities in baseid Column is == 2182677 Total number of unique entities in fornavn Column is == 875079 Total number of unique entities in efternavn Column is == 136889 Total number of unique entities in adresse Column is == 1394662 Total number of unique entities in stednavn Column is == 6231 Total number of unique entities in postnummer Column is == 1060 Total number of unique entities in bynavn Column is == 611 Total number of unique entities in kommunekode Column is == 102 Total number of unique entities in vej Column is == 45149 Total number of unique entities in husnr Column is == 988 Total number of unique entities in bogstav Column is == 27 Total number of unique entities in sal Column is == 103 Total number of unique entities in side Column is == 2958 Total number of unique entities in kvhx Column is == 1761391 Total number of unique entities in robinson Column is == 4 Total number of unique entities in tlfnr1 Column is == 2173321 Total number of unique entities in tlfnr2 Column is == 643562 Total number of unique entities in tlfnr3 Column is == 194333 Total number of unique entities in tlfnr4 Column is == 62137 Total number of unique entities in tlfnr5 Column is == 18420 Total number of unique entities in tlftype1 Column is == 3 Total number of unique entities in tlftype2 Column is == 3 Total number of unique entities in tlftype3 Column is == 3 Total number of unique entities in tlftype4 Column is == 3 Total number of unique entities in tlftype5 Column is == 3 Total number of unique entities in koen Column is == 3 Total number of unique entities in alder Column is == 82 Total number of unique entities in mosaic_gruppe Column is == 13 Total number of unique entities in mosaic_type Column is == 45 Total number of unique entities in ssh_0_born Column is == 360 Total number of unique entities in ssh_1_barn Column is == 239 Total number of unique entities in ssh_2_born Column is == 259 Total number of unique entities in ssh_3_plus_born 
Column is == 432 Total number of unique entities in udd_grundskole Column is == 2464 Total number of unique entities in udd_almen_gymnasial Column is == 2433 Total number of unique entities in udd_erhvervsgymasial Column is == 2 Total number of unique entities in udd_erhvervsfaglig_forloeb Column is == 3019 Total number of unique entities in udd_kort_videregaaende Column is == 1449 Total number of unique entities in udd_mellemlang_videregaaende Column is == 2730 Total number of unique entities in udd_bachelor Column is == 1643 Total number of unique entities in udd_lang_videregaaende Column is == 1319 Total number of unique entities in udd_forsker Column is == 1319 Total number of unique entities in udd_uoplyst Column is == 1236 Total number of unique entities in socio_high_selvst Column is == 782 Total number of unique entities in socio_mellemniveau Column is == 252 Total number of unique entities in socio_grundniveau Column is == 617 Total number of unique entities in socio_ledig_kontant Column is == 338 Total number of unique entities in socio_pensionist Column is == 890 Total number of unique entities in socio_other Column is == 443 Total number of unique entities in civilstand_ugift Column is == 324 Total number of unique entities in civilstand_gift Column is == 317 Total number of unique entities in civilstand_skilt Column is == 252 Total number of unique entities in civilstand_enke Column is == 289 Total number of unique entities in okonomisk_formaaen Column is == 6 Total number of unique entities in antal_beboere Column is == 89 Total number of unique entities in husstandsindkomst Column is == 299818 Total number of unique entities in ssh_anden_hustype Column is == 296 Total number of unique entities in ssh_enlig_m_born Column is == 252 Total number of unique entities in ssh_enlig_u_born Column is == 408 Total number of unique entities in ssh_par_m_born Column is == 344 Total number of unique entities in ssh_par_u_born Column is == 342 Total number of 
unique entities in donation_ssh Column is == 91 Total number of unique entities in donation_gns Column is == 8429 Total number of unique entities in ejerforhold Column is == 7 Total number of unique entities in enhedsanvendelse Column is == 7 Total number of unique entities in antal vaerelser Column is == 112
Unique Key for each lead in the dataset
The first name for each lead. This column do contain both first and middlename for the lead
The lastname for each lead. This column can contain both middle and lastname for the lead - but will in most cases only be the lastname
The adress of the leads. This cell contains following: Streetname, Housenumber, Letter, Floor, Which side of the floor the lead lives on
Please ask the data provider — the explanation for this feature is missing.
the zipcode wherein the lead lives. The zipcode is always 4 digits
The name of the city which the address belongs to.
Municipality code. Numeric identifier of the municipality the address is situated in. The municipality code herein is only 3 digits.
The streetname of the address
The housenumber in the address
if there is a letter belonging to the housnumber in the address it will be here
If the address has a floor you can see it in here
If there is multiple placements on the floor this column will describe the placement of the leads address on the floor
a numeric code, that describes the address. The code is built this way: K:Municipality code (first 3 digits) V:Streetcode (Next 4 digits) H:Housenumber(Next 4 digits) X:Floor and placement description (Floor):(Next 2 digits) (Placement):(Last digits)
The code is built like this: KKKVVVVHHHHEESSSS
Rules: municipalitycode can only be three digits long Streetcode can only be 4 digits long Housenumber can only be 4 digits long Floor can only be 2 digits long Placement can only be 4 digits long
describes if we are allowed to call the person. 0/blank= there is noone on this address registered on the "robinson-list" E.g. We are allowed to call the person 1= the person is registered on the "robinson-list" E.g. We are not allowed to call the person 2= match on partly the name and the address on the "robinson-list" e.g. we can call the person 3= match on the address on the "robinson-list" e.g. we can call the person
Primary phonenumber
Secondary Phonenumber
Tertiary phonenumber
fourth phonenumber
fifth phonenumber
What type of phone is the primary phonenumber: TLF=hardline mobil=Cellphone
What type of phone is the secondary phonenumber: TLF=hardline mobil=Cellphone
What type of phone is the tertiary phonenumber: TLF=hardline mobil=Cellphone
What type of phone is the fourth phonenumber: TLF=hardline mobil=Cellphone
What type of phone is the fifth phonenumber: TLF=hardline mobil=Cellphone
What gender does the lead presumeably have. M=Male K=Female Blank=Unknown the gender is estimatet based on the firstname
What age does the lead presumably have. It is based on the first name and other variables which are unknown to us.
Type of Mosaic_group that the leads belong to. Mosaic groups are a segmentet group of people who are believed to have certain age, gender, income, financial, family and educational backgrounds.
Type of Mosaic_type that the leads belong to. Mosaic types are a subgroup of mosaic_group and are a further segmentet group of people who are believed to have certain age, gender, income, financial, family and educational background.
probability of 0 kids in the household - based on clusterdata
probability of 1 kids in the household - based on clusterdata
probability of 2 kids in the household - based on clusterdata
probability of 3 or more kids in the household - based on clusterdata
The probability that the lead has completed bording-school
The probability that the lead have gone through highschool
The probability that the lead have gone through vocational high school
The probability that the lead have gone through a vocational course
The probability that the lead have gone through a short higher education
The probability that the lead have gone through a medium-term higher education
The probability that the lead have gone through a bachelor degree
The probability that the lead have gone through a higher education
The probability that the lead have a PhD
The probability that the lead have gone through an unknown educational background
The probability of the lead have a high or selfemployed occupational background
The probability of the lead have an intermediate occupational background
The probability of the lead have a basic occupational background
The probability of the lead have an unemployed occupational background
The probability of the lead have a pensioner occupational background
The probability of the lead have another occupational background
probability that the lead is not married
Probability that the leads is married
Probability that the lead is divorced
Probability that the lead is a widow
Financial capacity; the order is descending, where the top is the best: Hvid=White Grøn=Green Grå=Grey Gul=Yellow Rød=Red. The lighter the financial capacity is, the better.
estimated # of people living on the aforementioned address
average household income in DKK - based on clusterdata
Probability that the household is something other
Probability that the lead is a single parent who has one or more children
Probability that the lead is in a single adult who doesnt have children
Probability that the lead is in a couple who has one or more children
Probability that the lead is in a couple who doesnt have children
Probability for donation to charities - based on clusterdata
probability for donation. How probable is it that the lead will donate to charities - based on clusterdata
how is the house owned
Ejer=owner Lejer=tenant Andel=share Andet=other
How is the house used
Stuehus til landbrug=farmhouse for agriculture parcelhus=house række/kæde/dobbelthus=townhouse/Chainhouse/semi-detached house lejlighed= Apartment sommerhus= Summerhouse/vacationhouse andet=Other
How many rooms does the house/apartment have
# Call outcomes: one row per dial attempt (phone number, ending reason, date, time).
data_response = pd.read_csv('Call_logs_done.csv',sep=';',encoding='iso-8859-1')
data_response.head()
tlfnr | call_ending_reason | date | time | |
---|---|---|---|---|
0 | NaN | NaN | 01-04-2019 | 12:18 |
1 | NaN | NaN | 01-04-2019 | 13:33 |
2 | NaN | NaN | 02-04-2019 | 09:35 |
3 | NaN | NaN | 02-04-2019 | 09:36 |
4 | NaN | NaN | 02-04-2019 | 09:41 |
# Attach each call's outcome to the lead table.
# NOTE(review): this joins purely by index alignment — it assumes lead row i
# corresponds to call-log row i. No key (e.g. phone number) is used; confirm
# with the data provider that the two files are row-aligned.
data['call_ending_reason'] = data_response['call_ending_reason']
data.head()
baseid | fornavn | efternavn | adresse | stednavn | postnummer | bynavn | kommunekode | vej | husnr | ... | ssh_enlig_m_born | ssh_enlig_u_born | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 65536878 | ec107270-e365-41aa-82f7-ac7716e0a06f | 38c49621-2e54-4ae2-90ac-c54a47a657ec | Søbredden 24 | Svogerslev | 4000 | Roskilde | 265 | Søbredden | 24 | ... | 0 | 0 | 0,75 | 0 | 29.0 | 1196.0 | Ejer | Parcelhus | 5.0 | NaN |
1 | 63503287 | 5f229333-fe30-404d-a8f7-881d082fbd87 | 679d6544-5a01-4cb5-bc73-ccc936f50482 | Jacob Appels Alle 36 | NaN | 2770 | Kastrup | 185 | Jacob Appels Alle | 36 | ... | 0 | 0,121212121212121 | 0,757575757575758 | 0 | 35.0 | 569.0 | Ejer | Række/kæde/dobbelthus | 5.0 | NaN |
2 | 62199324 | 1a25f102-40f6-46c5-9ed7-f79259118aca | 0697dcfd-103d-454f-b8c4-f3e22abc6f68 | Højbovænge 8 | NaN | 4660 | Store Heddinge | 336 | Højbovænge | 8 | ... | 0,111111111111111 | 0,444444444444444 | 0,111111111111111 | 0,333333333333333 | 19.0 | 302.0 | Lejer | Række/kæde/dobbelthus | 3.0 | NaN |
3 | 64841017 | 348dc61c-209c-4b77-a3fe-9cde68ff65f6 | a18f669c-9bf6-498a-84bf-783a7b93b38d | Engdraget 2 | Gårslev | 7080 | Børkop | 630 | Engdraget | 2 | ... | 0 | 0 | 0,4 | 0,4 | 43.0 | 1747.0 | Ejer | Parcelhus | 7.0 | NaN |
4 | 10055668 | a64ff2ca-bb9d-46ab-8570-28bd39ebde27 | ca8cf5f7-6eff-4f98-840a-60baaa3679ab | Irisvej 9 | NaN | 3300 | Frederiksværk | 260 | Irisvej | 9 | ... | 0 | 0 | 0,333333333333333 | 0,333333333333333 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | NaN |
5 rows × 67 columns
The only data that is useful to us is that which has a known and useful outcome. The calls that were not answered are of no use to us at all.
# Keep only the calls that have a recorded outcome; unanswered calls carry no
# label. .copy() makes this an independent frame rather than a view of `data`,
# which prevents the SettingWithCopyWarning raised later when new columns
# (latitude/longitude) are assigned to it.
filtered_data = data[data['call_ending_reason'].notnull()].copy()
filtered_data.head(11)
baseid | fornavn | efternavn | adresse | stednavn | postnummer | bynavn | kommunekode | vej | husnr | ... | ssh_enlig_m_born | ssh_enlig_u_born | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 51934887 | fd0a8f24-2457-48b4-a7fe-fe0b95a3a81e | ba3df7f7-4943-4c68-b0bc-f606069e6d05 | Teglholt 12 | NaN | 6200 | Aabenraa | 580 | Teglholt | 12 | ... | 0 | 0,43 | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail |
22 | 11515649 | 15a1f0f6-b6e2-464a-8296-960659185184 | b3a586b3-4495-4ca5-a7ed-464508df504d | Sundvænget 32 | Dybbøl | 6400 | Sønderborg | 540 | Sundvænget | 32 | ... | 0 | 0,141414141414141 | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail |
83 | 12310066 | 4a679ecd-e896-4aad-b369-45f0164dc9e7 | ca8cf5f7-6eff-4f98-840a-60baaa3679ab | Spøttrupvej 42, st tv | Tjørring | 7400 | Herning | 657 | Spøttrupvej | 42 | ... | 0 | 0,25 | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse |
645 | 30029973 | 36db5eaa-2531-453e-9bb8-3a5b47a9a029 | d196005f-0729-4222-9761-531ecc10c72d | Hjortøvænget 12 | Skærbæk | 7000 | Fredericia | 607 | Hjortøvænget | 12 | ... | 0,333333333333333 | 0 | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse |
646 | 54059038 | e9c577d1-417e-4584-ad74-232b3f7dd06c | df2cd3d3-f172-4a18-b64a-6dcc070b9aa4 | Engvænget 32 | NaN | 2650 | Hvidovre | 167 | Engvænget | 32 | ... | 0,09 | 0 | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail |
649 | 61722285 | 1a2c9fde-ac13-46ee-8723-f27534ef14dd | af3b5f8c-5ce9-41af-a09b-8b463596dec1 | Nannasvej 10 | NaN | 6100 | Haderslev | 510 | Nannasvej | 10 | ... | 0 | 0 | 0,2 | 0,8 | 25.0 | 401.0 | Ejer | Parcelhus | 6.0 | Røde Kors - Sikkerhedsmail |
661 | 63420003 | c10cc775-7071-4ce1-882f-00bb7add40b6 | 8b90aa84-a58e-4af9-8c25-c9d1324b372b | Kløvenhøj 24 | Ølby | 7600 | Struer | 671 | Kløvenhøj | 24 | ... | 0 | 0,2 | 0,4 | 0,4 | 0.0 | 0.0 | Ejer | Parcelhus | 5.0 | Børns vilkår - Sikkerhedsmail |
662 | 56894951 | 2d942692-9654-461f-9cd1-4c2f5aa70722 | 0697dcfd-103d-454f-b8c4-f3e22abc6f68 | Abelonelundvej 4 | Strib | 5500 | Middelfart | 410 | Abelonelundvej | 4 | ... | 0,11 | 0,11 | 0 | 0,78 | 0.0 | 0.0 | Ejer | Parcelhus | 6.0 | Børns vilkår - Sikkerhedsmail |
667 | 57749394 | c1980451-c102-49ee-998d-f8310b130836 | ca8cf5f7-6eff-4f98-840a-60baaa3679ab | Borkvej 7 | No | 6950 | Ringkøbing | 760 | Borkvej | 7 | ... | 0 | 0,4 | 0 | 0,4 | 0.0 | 0.0 | Ejer | Stuehus til landbrug | 4.0 | Røde Kors - Sikkerhedsmail |
668 | 63082476 | 28fd697d-28e8-4f90-b189-85f7ab464dac | e6afb89e-e441-41da-8276-ac49b7d88206 | Slimmingevej 27 | NaN | 4100 | Ringsted | 259 | Slimmingevej | 27 | ... | 0 | 0 | 0,2 | 0,6 | 0.0 | 0.0 | Ejer | Stuehus til landbrug | 8.0 | Røde Kors - Sikkerhedsmail |
669 | 63217826 | 015d905c-07a5-45c1-bb3a-6dbfa91b607f | 64701ffb-883b-4ac0-85f4-e83ee973c82e | Svalevej 5 | Horbelev | 4871 | Horbelev | 376 | Svalevej | 5 | ... | 0,0808080808080808 | 0,585858585858586 | 0 | 0,333333333333333 | NaN | NaN | Selskab | Række/kæde/dobbelthus | 4.0 | Børns vilkår - Sikkerhedsmail |
11 rows × 67 columns
# Drop direct identifiers (names, address parts, phone numbers): they are
# (near-)unique per lead and carry no predictive signal.
identifier_cols = [
    'baseid', 'fornavn', 'efternavn', 'adresse', 'vej', 'husnr', 'bogstav',
    'sal', 'side', 'kvhx',
    'tlfnr1', 'tlfnr2', 'tlfnr3', 'tlfnr4', 'tlfnr5',
]
filtered_data.drop(columns=identifier_cols, inplace=True)
filtered_data.head()
stednavn | postnummer | bynavn | kommunekode | robinson | tlftype1 | tlftype2 | tlftype3 | tlftype4 | tlftype5 | ... | ssh_enlig_m_born | ssh_enlig_u_born | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | NaN | 6200 | Aabenraa | 580 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,43 | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail |
22 | Dybbøl | 6400 | Sønderborg | 540 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,141414141414141 | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail |
83 | Tjørring | 7400 | Herning | 657 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,25 | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse |
645 | Skærbæk | 7000 | Fredericia | 607 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,333333333333333 | 0 | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse |
646 | NaN | 2650 | Hvidovre | 167 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,09 | 0 | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail |
5 rows × 52 columns
import pgeocode
# We are targeting only Denmark ('dk') postal codes.
nomi = pgeocode.Nominatim('dk')
def convert_post_to_lat(post_number):
    """Look up the latitude of a Danish postal code via pgeocode."""
    result = nomi.query_postal_code(str(post_number))
    return result['latitude']


def convert_post_to_long(post_number):
    """Look up the longitude of a Danish postal code via pgeocode."""
    result = nomi.query_postal_code(str(post_number))
    return result['longitude']
# Geocode all postal codes in one batch query (a concrete list, not a lazy
# map object, so pgeocode can treat it as a sequence).
post_numbers = filtered_data['postnummer'].astype(str).tolist()
query = nomi.query_postal_code(post_numbers)
# `query` comes back with a fresh 0..n-1 RangeIndex while filtered_data still
# carries its original row labels (9, 22, 83, ...). Plain Series assignment
# aligns on index and would attach the wrong coordinates to most rows, so
# assign positionally via .to_numpy().
filtered_data['latitude'] = query['latitude'].to_numpy()
filtered_data['longitude'] = query['longitude'].to_numpy()
C:\Users\Bruger\AppData\Local\Temp\ipykernel_14744\3412094935.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data['latitude'] = query['latitude'] C:\Users\Bruger\AppData\Local\Temp\ipykernel_14744\3412094935.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data['longitude'] = query['longitude']
# Checkpoint the cleaned frame. pandas infers zip compression from the
# '.zip' extension; the pickle module itself is not needed here, so the
# previously unused `import pickle` was removed.
filtered_data.to_pickle('filtered.zip')
filtered_data.head()
stednavn | postnummer | bynavn | kommunekode | robinson | tlftype1 | tlftype2 | tlftype3 | tlftype4 | tlftype5 | ... | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | NaN | 6200 | Aabenraa | 580 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.4442 | 11.8065 |
22 | Dybbøl | 6400 | Sønderborg | 540 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail | 55.6602 | 11.4104 |
83 | Tjørring | 7400 | Herning | 657 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse | 56.1772 | 9.7805 |
645 | Skærbæk | 7000 | Fredericia | 607 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse | 56.0337 | 12.5881 |
646 | NaN | 2650 | Hvidovre | 167 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | ... | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.9281 | 11.6393 |
5 rows × 54 columns
stednavn
postnummer
bynavn
# The textual/coded location columns are now redundant: latitude/longitude
# already encode the geography numerically.
filtered_data.drop(columns=['stednavn', 'postnummer', 'bynavn', 'kommunekode'],
                   inplace=True)
filtered_data.head()
robinson | tlftype1 | tlftype2 | tlftype3 | tlftype4 | tlftype5 | koen | alder | mosaic_gruppe | mosaic_type | ... | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 76.0 | J | J35 | ... | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.4442 | 11.8065 |
22 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | M | 61.0 | B | B03 | ... | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail | 55.6602 | 11.4104 |
83 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 74.0 | L | L44 | ... | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse | 56.1772 | 9.7805 |
645 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 39.0 | F | F20 | ... | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse | 56.0337 | 12.5881 |
646 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | M | 66.0 | B | B05 | ... | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.9281 | 11.6393 |
5 rows × 50 columns
# Renumber the rows 0..n-1 now that filtering is finished; the old labels
# from the full dataset are no longer meaningful.
filtered_data.reset_index(drop=True, inplace=True)
filtered_data.head()
robinson | tlftype1 | tlftype2 | tlftype3 | tlftype4 | tlftype5 | koen | alder | mosaic_gruppe | mosaic_type | ... | ssh_par_m_born | ssh_par_u_born | donation_ssh | donation_gns | ejerforhold | enhedsanvendelse | antal vaerelser | call_ending_reason | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 76.0 | J | J35 | ... | 0 | 0,57 | 25.0 | 294.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.4442 | 11.8065 |
1 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | M | 61.0 | B | B03 | ... | 0,141414141414141 | 0,717171717171717 | 0.0 | 0.0 | Ejer | Parcelhus | 4.0 | call_reason_voicemail | 55.6602 | 11.4104 |
2 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 74.0 | L | L44 | ... | 0 | 0,5 | 27.0 | 421.0 | Ejer | Lejlighed | 5.0 | Ingen Interesse | 56.1772 | 9.7805 |
3 | 0 | MOBIL | MOBIL | NaN | NaN | NaN | M | 39.0 | F | F20 | ... | 0,333333333333333 | 0,333333333333333 | 13.0 | 13.0 | Ejer | Parcelhus | 5.0 | Ingen Interesse | 56.0337 | 12.5881 |
4 | 1 | MOBIL | MOBIL | NaN | NaN | NaN | M | 66.0 | B | B05 | ... | 0,18 | 0,73 | 47.0 | 1416.0 | Ejer | Række/kæde/dobbelthus | 4.0 | call_reason_voicemail | 55.9281 | 11.6393 |
5 rows × 50 columns
# Re-check cardinality after filtering. len(unique()) is kept (rather than
# nunique()) so NaN still counts as a distinct value, matching the earlier run.
for column in filtered_data.columns:
    print(f'Total number of unique entities in {column} Column is == {len(filtered_data[column].unique())}')
Total number of unique entities in robinson Column is == 4 Total number of unique entities in tlftype1 Column is == 3 Total number of unique entities in tlftype2 Column is == 3 Total number of unique entities in tlftype3 Column is == 3 Total number of unique entities in tlftype4 Column is == 3 Total number of unique entities in tlftype5 Column is == 3 Total number of unique entities in koen Column is == 3 Total number of unique entities in alder Column is == 81 Total number of unique entities in mosaic_gruppe Column is == 13 Total number of unique entities in mosaic_type Column is == 45 Total number of unique entities in ssh_0_born Column is == 348 Total number of unique entities in ssh_1_barn Column is == 231 Total number of unique entities in ssh_2_born Column is == 252 Total number of unique entities in ssh_3_plus_born Column is == 408 Total number of unique entities in udd_grundskole Column is == 2336 Total number of unique entities in udd_almen_gymnasial Column is == 2284 Total number of unique entities in udd_erhvervsgymasial Column is == 2 Total number of unique entities in udd_erhvervsfaglig_forloeb Column is == 2856 Total number of unique entities in udd_kort_videregaaende Column is == 1365 Total number of unique entities in udd_mellemlang_videregaaende Column is == 2533 Total number of unique entities in udd_bachelor Column is == 1536 Total number of unique entities in udd_lang_videregaaende Column is == 1231 Total number of unique entities in udd_forsker Column is == 1231 Total number of unique entities in udd_uoplyst Column is == 1141 Total number of unique entities in socio_high_selvst Column is == 782 Total number of unique entities in socio_mellemniveau Column is == 252 Total number of unique entities in socio_grundniveau Column is == 617 Total number of unique entities in socio_ledig_kontant Column is == 338 Total number of unique entities in socio_pensionist Column is == 890 Total number of unique entities in socio_other Column is == 443 Total 
number of unique entities in civilstand_ugift Column is == 323 Total number of unique entities in civilstand_gift Column is == 309 Total number of unique entities in civilstand_skilt Column is == 249 Total number of unique entities in civilstand_enke Column is == 285 Total number of unique entities in okonomisk_formaaen Column is == 6 Total number of unique entities in antal_beboere Column is == 84 Total number of unique entities in husstandsindkomst Column is == 285645 Total number of unique entities in ssh_anden_hustype Column is == 280 Total number of unique entities in ssh_enlig_m_born Column is == 245 Total number of unique entities in ssh_enlig_u_born Column is == 389 Total number of unique entities in ssh_par_m_born Column is == 333 Total number of unique entities in ssh_par_u_born Column is == 333 Total number of unique entities in donation_ssh Column is == 91 Total number of unique entities in donation_gns Column is == 7769 Total number of unique entities in ejerforhold Column is == 7 Total number of unique entities in enhedsanvendelse Column is == 7 Total number of unique entities in antal vaerelser Column is == 104 Total number of unique entities in call_ending_reason Column is == 68 Total number of unique entities in latitude Column is == 785 Total number of unique entities in longitude Column is == 913
# Inspect the distinct values of this column before deciding whether to drop it.
unique_vals = filtered_data['udd_erhvervsgymasial'].unique()
print(unique_vals)
[ 0. nan]
# This column carries no signal — it only ever holds 0 or NaN — so remove it.
filtered_data.drop(columns=['udd_erhvervsgymasial'], inplace=True)
One-hot encoding is never applied to the features that are continuous, i.e. the probability columns.
# One-hot encode the categorical features (phone types, gender, mosaic
# segments, ownership, usage, financial capacity). Continuous/probability
# columns are left as-is.
columns_list = ['tlftype1', 'tlftype2', 'tlftype3', 'tlftype4', 'tlftype5',
                'koen', 'mosaic_gruppe', 'mosaic_type', 'ejerforhold',
                'enhedsanvendelse', 'okonomisk_formaaen']
for column in columns_list:
    # Column names are already strings — no str() cast needed.
    one_hot = pd.get_dummies(filtered_data[column], prefix=column, prefix_sep='_')
    # Replace the raw categorical column with its encoded dummy columns.
    filtered_data = filtered_data.drop(columns=[column]).join(one_hot)
filtered_data.head()
robinson | alder | ssh_0_born | ssh_1_barn | ssh_2_born | ssh_3_plus_born | udd_grundskole | udd_almen_gymnasial | udd_erhvervsfaglig_forloeb | udd_kort_videregaaende | ... | enhedsanvendelse_Lejlighed | enhedsanvendelse_Parcelhus | enhedsanvendelse_Række/kæde/dobbelthus | enhedsanvendelse_Sommerhus | enhedsanvendelse_Stuehus til landbrug | okonomisk_formaaen_1. HVID | okonomisk_formaaen_2. GRØN | okonomisk_formaaen_3. GRÅ | okonomisk_formaaen_4. GUL | okonomisk_formaaen_5. RØD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 76.0 | 1 | 0 | 0 | 0 | 0 | 0,0909090909090909 | 0,272727272727273 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 1 | 61.0 | 0,75 | 0,25 | 0 | 0 | 0,12280701754386 | 0 | 0,254385964912281 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 74.0 | 1 | 0 | 0 | 0 | 0,2 | 0 | 0,6 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 0 | 39.0 | 0,4 | 0,4 | 0,2 | 0 | 0 | 0 | 0 | 0,166666666666667 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 1 | 66.0 | 0,73 | 0 | 0,18 | 0,09 | 0 | 0 | 0,28448275862069 | 0,0689655172413793 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 123 columns
cols = ['alder','ssh_0_born','ssh_1_barn','ssh_2_born','ssh_3_plus_born','udd_grundskole','udd_almen_gymnasial','udd_erhvervsfaglig_forloeb','udd_kort_videregaaende','udd_mellemlang_videregaaende','udd_bachelor','udd_lang_videregaaende','udd_forsker','udd_uoplyst','socio_high_selvst','socio_mellemniveau','socio_grundniveau','socio_ledig_kontant','socio_pensionist','socio_other','civilstand_ugift','civilstand_gift','civilstand_skilt','civilstand_enke','antal_beboere','husstandsindkomst','ssh_anden_hustype','ssh_enlig_m_born','ssh_enlig_u_born','ssh_par_m_born','ssh_par_u_born','donation_ssh','donation_gns','antal vaerelser']
for col in cols:
    try:
        # The Danish CSV export uses ',' as the decimal separator; normalize
        # to '.' so the values can be parsed as floats.
        filtered_data[col] = filtered_data[col].str.replace(',', '.')
    except AttributeError:
        # .str only exists on object (string) columns; this column is already
        # numeric. Catch only AttributeError — a bare except would hide real
        # errors (e.g. a misspelled column name raising KeyError).
        print(col)
filtered_data.head()
robinson | alder | ssh_0_born | ssh_1_barn | ssh_2_born | ssh_3_plus_born | udd_grundskole | udd_almen_gymnasial | udd_erhvervsfaglig_forloeb | udd_kort_videregaaende | ... | enhedsanvendelse_Lejlighed | enhedsanvendelse_Parcelhus | enhedsanvendelse_Række/kæde/dobbelthus | enhedsanvendelse_Sommerhus | enhedsanvendelse_Stuehus til landbrug | okonomisk_formaaen_1. HVID | okonomisk_formaaen_2. GRØN | okonomisk_formaaen_3. GRÅ | okonomisk_formaaen_4. GUL | okonomisk_formaaen_5. RØD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 76.0 | 1 | 0 | 0 | 0 | 0 | 0.0909090909090909 | 0.272727272727273 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 1 | 61.0 | 0.75 | 0.25 | 0 | 0 | 0.12280701754386 | 0 | 0.254385964912281 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 74.0 | 1 | 0 | 0 | 0 | 0.2 | 0 | 0.6 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 0 | 39.0 | 0.4 | 0.4 | 0.2 | 0 | 0 | 0 | 0 | 0.166666666666667 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 1 | 66.0 | 0.73 | 0 | 0.18 | 0.09 | 0 | 0 | 0.28448275862069 | 0.0689655172413793 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 123 columns
Exploding gradients are a problem where large error gradients accumulate and result in very large updates to neural network model weights during training.
# Min-max scale household income to [0, 1] so its large DKK magnitude does
# not dominate training (exploding-gradient concern noted above).
income = filtered_data['husstandsindkomst']
filtered_data['husstandsindkomst'] = (income - income.min()) / (income.max() - income.min())
import pandas as pd  # NOTE(review): pandas is already imported at the top of the notebook
# Lookup table that maps each raw call-ending reason to a Yes/No/Unfinished label.
mapping_file = pd.read_csv('map.csv',encoding='iso-8859-1')
mapping_file.head()
call_ending_reason | meaning | Yes/no/unfinished | |
---|---|---|---|
0 | Ingen Interesse | No I will not donate | No |
1 | Vil Ikke Kontaktes | Do not want to be contacted again | No |
2 | FORKERT NUMMER | Wrong number/said person do not exist on this ... | No |
3 | Ugyldigt nummer (anvendes n?r nummer er forkert) | Wrong number/said person do not exist on this ... | No |
4 | Vil ikke udlevere BS-oplysninger | Do not want to give out direct debit informations | No |
def convert_to_int(x):
    """Encode a call-outcome label as an integer class.

    'No' -> 0, 'Yes' -> 1, 'Unfinished' -> 2. Any other value returns
    None, matching the original if/elif chain's fall-through behaviour.
    """
    # A dict lookup replaces the if/elif chain; .get defaults to None,
    # which preserves the original behaviour for unrecognised labels.
    return {'No': 0, 'Yes': 1, 'Unfinished': 2}.get(x)
# Series.map accepts the function directly; wrapping it in a lambda
# (`.apply(lambda x: convert_to_int(x))`) was redundant.
mapping_file['Yes/no/unfinished'] = mapping_file['Yes/no/unfinished'].map(convert_to_int)
mapping_file.head()
call_ending_reason | meaning | Yes/no/unfinished | |
---|---|---|---|
0 | Ingen Interesse | No I will not donate | 0 |
1 | Vil Ikke Kontaktes | Do not want to be contacted again | 0 |
2 | FORKERT NUMMER | Wrong number/said person do not exist on this ... | 0 |
3 | Ugyldigt nummer (anvendes n?r nummer er forkert) | Wrong number/said person do not exist on this ... | 0 |
4 | Vil ikke udlevere BS-oplysninger | Do not want to give out direct debit informations | 0 |
# Build the reason -> integer lookup in one step instead of a manual loop,
# then recode the target column with it.
dictionary = dict(zip(mapping_file['call_ending_reason'], mapping_file['Yes/no/unfinished']))
filtered_data['call_ending_reason'] = filtered_data['call_ending_reason'].map(dictionary)
# Split into features (x) and target (y).
# NOTE: the original did `x = filtered_data` then `del x['call_ending_reason']`,
# which mutated filtered_data itself because x was only an alias; drop() returns
# a new frame and leaves the original intact.
y = filtered_data['call_ending_reason']
x = filtered_data.drop(columns=['call_ending_reason'])
x = x.fillna(-1)  # sentinel for missing feature values
x.isnull().values.any()
y = y.fillna(0) #Empty reason should be zero
y.isnull().values.any()
False
# x.to_csv('x.csv', encoding='iso-8859-1')
# y.to_csv('y.csv', encoding='iso-8859-1')
# import numpy as np
# def clean_dataset(df):
# assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
# df.dropna(inplace=True)
# indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
# return df[indices_to_keep].astype(np.float64)
In my experience, the fancy t-SNE visualization used for data analysis can sometimes be deceiving, especially on real-world data problems. KNN also works by exploiting the relationship between neighbours. If KNN performs well to some extent, the features are represented in the higher-dimensional space in such a way that a distinction can be made among them.
# from sklearn.ensemble import RandomForestClassifier
# rfc = RandomForestClassifier()
# rfc.fit(x, y)
from sklearn.model_selection import train_test_split
# 75/25 split on the full (imbalanced) data.
# NOTE(review): no random_state, so the split is not reproducible across runs,
# and no stratify= despite the heavy class imbalance — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.75)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
# NOTE(review): the variable is named `rfc` but holds an MLP, not a random forest.
# Three hidden layers (100, 50, 10), inverse-scaling LR schedule from 1e-4,
# early stop after 50 epochs without tol improvement, at most 1000 iterations.
rfc = MLPClassifier(verbose=True,hidden_layer_sizes=(100,50,10),learning_rate='invscaling',learning_rate_init=0.0001, n_iter_no_change=50, max_iter=1000)
rfc.fit(X_train, y_train)
Iteration 1, loss = 0.64777083 Iteration 2, loss = 0.61998823 Iteration 3, loss = 0.60810727 Iteration 4, loss = 0.61431193 Iteration 5, loss = 0.61331521 Iteration 6, loss = 0.60753339 Iteration 7, loss = 0.60393861 Iteration 8, loss = 0.60279712 Iteration 9, loss = 0.60324869 Iteration 10, loss = 0.60323766 Iteration 11, loss = 0.59907949 Iteration 12, loss = 0.59671948 Iteration 13, loss = 0.59781176 Iteration 14, loss = 0.59457808 Iteration 15, loss = 0.59430114 Iteration 16, loss = 0.59218180 Iteration 17, loss = 0.58931528 Iteration 18, loss = 0.59039334 Iteration 19, loss = 0.58869816 Iteration 20, loss = 0.58951301 Iteration 21, loss = 0.58711687 Iteration 22, loss = 0.58615668 Iteration 23, loss = 0.58588281 Iteration 24, loss = 0.58521278 Iteration 25, loss = 0.58494883 Iteration 26, loss = 0.58354615 Iteration 27, loss = 0.58244103 Iteration 28, loss = 0.58245776 Iteration 29, loss = 0.58251832 Iteration 30, loss = 0.58112408 Iteration 31, loss = 0.58225162 Iteration 32, loss = 0.58080646 Iteration 33, loss = 0.57989833 Iteration 34, loss = 0.58019845 Iteration 35, loss = 0.57842373 Iteration 36, loss = 0.57854850 Iteration 37, loss = 0.57762902 Iteration 38, loss = 0.57767142 Iteration 39, loss = 0.57775166 Iteration 40, loss = 0.57662654 Iteration 41, loss = 0.57673124 Iteration 42, loss = 0.57646452 Iteration 43, loss = 0.57658903 Iteration 44, loss = 0.57624564 Iteration 45, loss = 0.57610325 Iteration 46, loss = 0.57599913 Iteration 47, loss = 0.57561941 Iteration 48, loss = 0.57553930 Iteration 49, loss = 0.57554522 Iteration 50, loss = 0.57534226 Iteration 51, loss = 0.57525283 Iteration 52, loss = 0.57519467 Iteration 53, loss = 0.57510471 Iteration 54, loss = 0.57510156 Iteration 55, loss = 0.57499211 Iteration 56, loss = 0.57496538 Iteration 57, loss = 0.57493708 Iteration 58, loss = 0.57495190 Iteration 59, loss = 0.57488816 Iteration 60, loss = 0.57488638 Iteration 61, loss = 0.57479721 Iteration 62, loss = 0.57473223 Iteration 63, loss = 
0.57475108 Iteration 64, loss = 0.57467829 Iteration 65, loss = 0.57470547 Iteration 66, loss = 0.57458585 Iteration 67, loss = 0.57466983 Iteration 68, loss = 0.57460967 Iteration 69, loss = 0.57462100 Iteration 70, loss = 0.57451775 Iteration 71, loss = 0.57443434 Iteration 72, loss = 0.57446263 Iteration 73, loss = 0.57445124 Iteration 74, loss = 0.57439550 Iteration 75, loss = 0.57438002 Iteration 76, loss = 0.57430134 Iteration 77, loss = 0.57432144 Iteration 78, loss = 0.57429302 Iteration 79, loss = 0.57431137 Iteration 80, loss = 0.57428483 Iteration 81, loss = 0.57421079 Iteration 82, loss = 0.57417083 Iteration 83, loss = 0.57413732 Iteration 84, loss = 0.57408192 Iteration 85, loss = 0.57407828 Iteration 86, loss = 0.57405153 Iteration 87, loss = 0.57403162 Iteration 88, loss = 0.57399044 Iteration 89, loss = 0.57395314 Iteration 90, loss = 0.57394180 Iteration 91, loss = 0.57392033 Iteration 92, loss = 0.57387986 Iteration 93, loss = 0.57384104 Iteration 94, loss = 0.57378687 Iteration 95, loss = 0.57375078 Iteration 96, loss = 0.57373843 Iteration 97, loss = 0.57370981 Iteration 98, loss = 0.57367616 Iteration 99, loss = 0.57365536 Iteration 100, loss = 0.57362819 Iteration 101, loss = 0.57357638 Iteration 102, loss = 0.57357717 Iteration 103, loss = 0.57352019 Iteration 104, loss = 0.57352027 Iteration 105, loss = 0.57347332 Iteration 106, loss = 0.57340626 Training loss did not improve more than tol=0.000100 for 50 consecutive epochs. Stopping.
MLPClassifier(hidden_layer_sizes=(100, 50, 10), learning_rate='invscaling', learning_rate_init=0.0001, max_iter=1000, n_iter_no_change=50, verbose=True)
from sklearn.metrics import classification_report
# Evaluate on the held-out 25% test split (per-class precision/recall/F1).
print(classification_report(y_test, rfc.predict(X_test)))
# from sklearn.metrics import classification_report
# print(classification_report(y[5000:20000], rfc.predict(x[5000:20000])))
precision recall f1-score support 0.0 0.12 0.00 0.00 57358 1.0 0.00 0.00 0.00 6881 2.0 0.80 1.00 0.89 251386 accuracy 0.80 315625 macro avg 0.31 0.33 0.30 315625 weighted avg 0.66 0.80 0.71 315625
x.head()
robinson | alder | ssh_0_born | ssh_1_barn | ssh_2_born | ssh_3_plus_born | udd_grundskole | udd_almen_gymnasial | udd_erhvervsfaglig_forloeb | udd_kort_videregaaende | ... | enhedsanvendelse_Lejlighed | enhedsanvendelse_Parcelhus | enhedsanvendelse_Række/kæde/dobbelthus | enhedsanvendelse_Sommerhus | enhedsanvendelse_Stuehus til landbrug | okonomisk_formaaen_1. HVID | okonomisk_formaaen_2. GRØN | okonomisk_formaaen_3. GRÅ | okonomisk_formaaen_4. GUL | okonomisk_formaaen_5. RØD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 76.0 | 1 | 0 | 0 | 0 | 0 | 0.0909090909090909 | 0.272727272727273 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 1 | 61.0 | 0.75 | 0.25 | 0 | 0 | 0.12280701754386 | 0 | 0.254385964912281 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 74.0 | 1 | 0 | 0 | 0 | 0.2 | 0 | 0.6 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 0 | 39.0 | 0.4 | 0.4 | 0.2 | 0 | 0 | 0 | 0 | 0.166666666666667 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 1 | 66.0 | 0.73 | 0 | 0.18 | 0.09 | 0 | 0 | 0.28448275862069 | 0.0689655172413793 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 122 columns
y.value_counts()
2.0 1005293 0.0 229547 1.0 27659 Name: call_ending_reason, dtype: int64
import matplotlib.pyplot as plt

# Bar chart of the target-class distribution.
# The original hard-coded counts (229106, 27659, 1005734) which disagreed with
# y.value_counts() (229547, 27659, 1005293); derive them from y instead so the
# plot always matches the data.
counts = y.value_counts()
label = ['No', 'Yes', 'Unfinished']
number = [counts.get(0.0, 0), counts.get(1.0, 0), counts.get(2.0, 0)]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(label, number)
plt.show()
An imbalanced classification problem is an example of a classification problem where the distribution of examples across the known classes is biased or skewed. ... Many real-world classification problems have an imbalanced class distribution, such as fraud detection, spam detection, and churn prediction.
We are going to try to downsample the majority class
from imblearn.under_sampling import RandomUnderSampler
# Downsample every class except the minority ('not minority') so all classes
# end up with the minority class's count (27,659 each, per the output below);
# random_state=1 makes the sampling reproducible.
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=1)
df_balanced, balanced_labels = rus.fit_resample(x, y)
balanced_labels.value_counts()
0.0 27659 1.0 27659 2.0 27659 Name: call_ending_reason, dtype: int64
import matplotlib.pyplot as plt

# Bar chart of the balanced class distribution.
# Fixes the 'Unfished' label typo and derives the counts from balanced_labels
# instead of hard-coding 27659 three times.
counts = balanced_labels.value_counts()
label = ['No', 'Yes', 'Unfinished']
number = [counts.get(0.0, 0), counts.get(1.0, 0), counts.get(2.0, 0)]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(label, number)
plt.show()
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(df_balanced, balanced_labels)
from sklearn.metrics import classification_report
# NOTE(review): this evaluates on the TRAINING data itself, so the perfect
# 1.00 scores below say nothing about generalisation — evaluate on a held-out
# split instead (a random forest easily memorises its training set).
print(classification_report(balanced_labels, rfc.predict(df_balanced)))
precision recall f1-score support 0.0 1.00 1.00 1.00 27659 1.0 1.00 1.00 1.00 27659 2.0 1.00 1.00 1.00 27659 accuracy 1.00 82977 macro avg 1.00 1.00 1.00 82977 weighted avg 1.00 1.00 1.00 82977
from sklearn.model_selection import train_test_split
# NOTE(review): train_size=0.99 leaves only ~1% of 82,977 rows (~830 samples)
# for testing, so the metrics below are very noisy; also no random_state, so
# the split is not reproducible — confirm both choices are intended.
X_train, X_test, y_train, y_test = train_test_split(df_balanced, balanced_labels, train_size = 0.99)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
# NOTE(review): `rfc` again holds an MLP, not a random forest. Same (100,50,10)
# architecture as before, but with a constant learning rate of 5e-4.
rfc = MLPClassifier(verbose=True,hidden_layer_sizes=(100,50,10),learning_rate='constant',learning_rate_init=0.0005, n_iter_no_change=50, max_iter=1000)
# rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
from sklearn.metrics import classification_report
print(classification_report(y_test, rfc.predict(X_test)))
Iteration 1, loss = 1.32958292 Iteration 2, loss = 1.17382379 Iteration 3, loss = 1.17798205 Iteration 4, loss = 1.16973133 Iteration 5, loss = 1.16275242 Iteration 6, loss = 1.14890057 Iteration 7, loss = 1.14993119 Iteration 8, loss = 1.13621909 Iteration 9, loss = 1.15393737 Iteration 10, loss = 1.12756345 Iteration 11, loss = 1.13597190 Iteration 12, loss = 1.10771288 Iteration 13, loss = 1.11449666 Iteration 14, loss = 1.10836334 Iteration 15, loss = 1.10619507 Iteration 16, loss = 1.10711457 Iteration 17, loss = 1.11136752 Iteration 18, loss = 1.10664509 Iteration 19, loss = 1.10600425 Iteration 20, loss = 1.10593425 Iteration 21, loss = 1.10258331 Iteration 22, loss = 1.10134637 Iteration 23, loss = 1.10732109 Iteration 24, loss = 1.09940176 Iteration 25, loss = 1.10047125 Iteration 26, loss = 1.10109537 Iteration 27, loss = 1.09928363 Iteration 28, loss = 1.10029805 Iteration 29, loss = 1.09909523 Iteration 30, loss = 1.10152546 Iteration 31, loss = 1.09757330 Iteration 32, loss = 1.09762873 Iteration 33, loss = 1.09648323 Iteration 34, loss = 1.09662355 Iteration 35, loss = 1.09694548 Iteration 36, loss = 1.09673984 Iteration 37, loss = 1.09683113 Iteration 38, loss = 1.09630211 Iteration 39, loss = 1.09595167 Iteration 40, loss = 1.09571349 Iteration 41, loss = 1.09546417 Iteration 42, loss = 1.09587294 Iteration 43, loss = 1.09555470 Iteration 44, loss = 1.09488909 Iteration 45, loss = 1.09482741 Iteration 46, loss = 1.09468202 Iteration 47, loss = 1.09473918 Iteration 48, loss = 1.09442563 Iteration 49, loss = 1.09419259 Iteration 50, loss = 1.09404434 Iteration 51, loss = 1.09387755 Iteration 52, loss = 1.09365237 Iteration 53, loss = 1.09344749 Iteration 54, loss = 1.09323214 Iteration 55, loss = 1.09300811 Iteration 56, loss = 1.09275027 Iteration 57, loss = 1.09262124 Iteration 58, loss = 1.09228070 Iteration 59, loss = 1.09187742 Iteration 60, loss = 1.09154762 Iteration 61, loss = 1.09135797 Iteration 62, loss = 1.09110705 Iteration 63, loss = 
1.09098558 Iteration 64, loss = 1.09036456 Iteration 65, loss = 1.09035000 Iteration 66, loss = 1.08988133 Iteration 67, loss = 1.08957648 Iteration 68, loss = 1.08954219 Iteration 69, loss = 1.08857836 Iteration 70, loss = 1.08859061 Iteration 71, loss = 1.08829504 Iteration 72, loss = 1.08776913 Iteration 73, loss = 1.08747694 Iteration 74, loss = 1.08697770 Iteration 75, loss = 1.08680607 Iteration 76, loss = 1.08634044 Iteration 77, loss = 1.08565296 Iteration 78, loss = 1.08599048 Iteration 79, loss = 1.08563575 Iteration 80, loss = 1.08547957 Iteration 81, loss = 1.08480080 Iteration 82, loss = 1.08390291 Iteration 83, loss = 1.08351013 Iteration 84, loss = 1.08357633 Iteration 85, loss = 1.08286934 Iteration 86, loss = 1.08218604 Iteration 87, loss = 1.08181367 Iteration 88, loss = 1.08200830 Iteration 89, loss = 1.08133112 Iteration 90, loss = 1.08116489 Iteration 91, loss = 1.08061552 Iteration 92, loss = 1.07998952 Iteration 93, loss = 1.08079146 Iteration 94, loss = 1.07996682 Iteration 95, loss = 1.07870790 Iteration 96, loss = 1.07778636 Iteration 97, loss = 1.07790600 Iteration 98, loss = 1.07721476 Iteration 99, loss = 1.07660069 Iteration 100, loss = 1.07624284 Iteration 101, loss = 1.07654407 Iteration 102, loss = 1.07573565 Iteration 103, loss = 1.07521014 Iteration 104, loss = 1.07492561 Iteration 105, loss = 1.07399228 Iteration 106, loss = 1.07352169 Iteration 107, loss = 1.07347454 Iteration 108, loss = 1.07250182 Iteration 109, loss = 1.07181892 Iteration 110, loss = 1.07254877 Iteration 111, loss = 1.07124297 Iteration 112, loss = 1.07075382 Iteration 113, loss = 1.06995389 Iteration 114, loss = 1.06978551 Iteration 115, loss = 1.06912700 Iteration 116, loss = 1.06859849 Iteration 117, loss = 1.06816041 Iteration 118, loss = 1.06829347 Iteration 119, loss = 1.06718457 Iteration 120, loss = 1.06697756 Iteration 121, loss = 1.06723005 Iteration 122, loss = 1.06637848 Iteration 123, loss = 1.06606195 Iteration 124, loss = 1.06514628 Iteration 
125, loss = 1.06414802 Iteration 126, loss = 1.06431648 Iteration 127, loss = 1.06409033 Iteration 128, loss = 1.06325656 Iteration 129, loss = 1.06297425 Iteration 130, loss = 1.06224040 Iteration 131, loss = 1.06168101 Iteration 132, loss = 1.06123806 Iteration 133, loss = 1.06104693 Iteration 134, loss = 1.06027743 Iteration 135, loss = 1.05993886 Iteration 136, loss = 1.05973855 Iteration 137, loss = 1.05878706 Iteration 138, loss = 1.05867528 Iteration 139, loss = 1.05804531 Iteration 140, loss = 1.05744906 Iteration 141, loss = 1.05691735 Iteration 142, loss = 1.05664131 Iteration 143, loss = 1.05577439 Iteration 144, loss = 1.05562281 Iteration 145, loss = 1.05543808 Iteration 146, loss = 1.05416183 Iteration 147, loss = 1.05462832 Iteration 148, loss = 1.05387215 Iteration 149, loss = 1.05297521 Iteration 150, loss = 1.05314814 Iteration 151, loss = 1.05103858 Iteration 152, loss = 1.05247854 Iteration 153, loss = 1.05133800 Iteration 154, loss = 1.05075880 Iteration 155, loss = 1.05031957 Iteration 156, loss = 1.04992163 Iteration 157, loss = 1.04930741 Iteration 158, loss = 1.04923314 Iteration 159, loss = 1.04835843 Iteration 160, loss = 1.04803262 Iteration 161, loss = 1.04810848 Iteration 162, loss = 1.04806761 Iteration 163, loss = 1.04712539 Iteration 164, loss = 1.04625580 Iteration 165, loss = 1.04681005 Iteration 166, loss = 1.04690796 Iteration 167, loss = 1.04533942 Iteration 168, loss = 1.04540972 Iteration 169, loss = 1.04426914 Iteration 170, loss = 1.04399191 Iteration 171, loss = 1.04484498 Iteration 172, loss = 1.04585781 Iteration 173, loss = 1.04395345 Iteration 174, loss = 1.04301116 Iteration 175, loss = 1.04160483 Iteration 176, loss = 1.04093849 Iteration 177, loss = 1.04111133 Iteration 178, loss = 1.04023517 Iteration 179, loss = 1.04072626 Iteration 180, loss = 1.04076880 Iteration 181, loss = 1.03976513 Iteration 182, loss = 1.03919063 Iteration 183, loss = 1.03980694 Iteration 184, loss = 1.03805745 Iteration 185, loss = 
1.03775399 Iteration 186, loss = 1.03793278 Iteration 187, loss = 1.03809177 Iteration 188, loss = 1.03801989 Iteration 189, loss = 1.03712718 Iteration 190, loss = 1.03699491 Iteration 191, loss = 1.03595260 Iteration 192, loss = 1.03542557 Iteration 193, loss = 1.03665553 Iteration 194, loss = 1.03482278 Iteration 195, loss = 1.03473248 Iteration 196, loss = 1.03378772 Iteration 197, loss = 1.03349631 Iteration 198, loss = 1.03292589 Iteration 199, loss = 1.03294175 Iteration 200, loss = 1.03211726 Iteration 201, loss = 1.03269302 Iteration 202, loss = 1.03203076 Iteration 203, loss = 1.03118882 Iteration 204, loss = 1.03072580 Iteration 205, loss = 1.03075197 Iteration 206, loss = 1.02863130 Iteration 207, loss = 1.03045681 Iteration 208, loss = 1.02955150 Iteration 209, loss = 1.03046046 Iteration 210, loss = 1.02797217 Iteration 211, loss = 1.02807452 Iteration 212, loss = 1.02711934 Iteration 213, loss = 1.02751267 Iteration 214, loss = 1.02680380 Iteration 215, loss = 1.02638458 Iteration 216, loss = 1.02764123 Iteration 217, loss = 1.02665648 Iteration 218, loss = 1.02576724 Iteration 219, loss = 1.02587263 Iteration 220, loss = 1.02570350 Iteration 221, loss = 1.02574609 Iteration 222, loss = 1.02480034 Iteration 223, loss = 1.02535963 Iteration 224, loss = 1.02427491 Iteration 225, loss = 1.02439672 Iteration 226, loss = 1.02370632 Iteration 227, loss = 1.02240893 Iteration 228, loss = 1.02387538 Iteration 229, loss = 1.02147330 Iteration 230, loss = 1.02283700 Iteration 231, loss = 1.02360277 Iteration 232, loss = 1.02165348 Iteration 233, loss = 1.02191429 Iteration 234, loss = 1.02150599 Iteration 235, loss = 1.02107779 Iteration 236, loss = 1.02285793 Iteration 237, loss = 1.02218805 Iteration 238, loss = 1.02048871 Iteration 239, loss = 1.02097715 Iteration 240, loss = 1.01912654 Iteration 241, loss = 1.01947821 Iteration 242, loss = 1.01924574 Iteration 243, loss = 1.01797720 Iteration 244, loss = 1.01846717 Iteration 245, loss = 1.01823458 
Iteration 246, loss = 1.01827314 Iteration 247, loss = 1.01772186 Iteration 248, loss = 1.01644207 Iteration 249, loss = 1.01711936 Iteration 250, loss = 1.01771949 Iteration 251, loss = 1.01974530 Iteration 252, loss = 1.02018019 Iteration 253, loss = 1.01764126 Iteration 254, loss = 1.01676338 Iteration 255, loss = 1.01625573 Iteration 256, loss = 1.01493192 Iteration 257, loss = 1.01600074 Iteration 258, loss = 1.01596189 Iteration 259, loss = 1.01449007 Iteration 260, loss = 1.01702424 Iteration 261, loss = 1.01406420 Iteration 262, loss = 1.01498624 Iteration 263, loss = 1.01389573 Iteration 264, loss = 1.01282257 Iteration 265, loss = 1.01407127 Iteration 266, loss = 1.01384982 Iteration 267, loss = 1.01525060 Iteration 268, loss = 1.01376363 Iteration 269, loss = 1.01213140 Iteration 270, loss = 1.01234962 Iteration 271, loss = 1.01285216 Iteration 272, loss = 1.01207405 Iteration 273, loss = 1.01253674 Iteration 274, loss = 1.01224531 Iteration 275, loss = 1.01232972 Iteration 276, loss = 1.01150781 Iteration 277, loss = 1.01176198 Iteration 278, loss = 1.01316537 Iteration 279, loss = 1.01068766 Iteration 280, loss = 1.00964816 Iteration 281, loss = 1.00863978 Iteration 282, loss = 1.01013200 Iteration 283, loss = 1.00909582 Iteration 284, loss = 1.00900545 Iteration 285, loss = 1.01111183 Iteration 286, loss = 1.01004057 Iteration 287, loss = 1.00782741 Iteration 288, loss = 1.00842926 Iteration 289, loss = 1.00779102 Iteration 290, loss = 1.00770760 Iteration 291, loss = 1.00771807 Iteration 292, loss = 1.00811901 Iteration 293, loss = 1.00800285 Iteration 294, loss = 1.00943394 Iteration 295, loss = 1.00755538 Iteration 296, loss = 1.00699696 Iteration 297, loss = 1.00553626 Iteration 298, loss = 1.00488759 Iteration 299, loss = 1.00773875 Iteration 300, loss = 1.00879360 Iteration 301, loss = 1.00621451 Iteration 302, loss = 1.00495112 Iteration 303, loss = 1.00635996 Iteration 304, loss = 1.00565570 Iteration 305, loss = 1.00577629 Iteration 306, loss 
= 1.00472289 Iteration 307, loss = 1.00446508 Iteration 308, loss = 1.00373593 Iteration 309, loss = 1.00273000 Iteration 310, loss = 1.00365872 Iteration 311, loss = 1.00434226 Iteration 312, loss = 1.00316744 Iteration 313, loss = 1.00428735 Iteration 314, loss = 1.00228985 Iteration 315, loss = 1.00293545 Iteration 316, loss = 1.00450845 Iteration 317, loss = 1.00211691 Iteration 318, loss = 1.00359696 Iteration 319, loss = 1.00662362 Iteration 320, loss = 1.00568749 Iteration 321, loss = 1.00521728 Iteration 322, loss = 1.00270617 Iteration 323, loss = 1.00110165 Iteration 324, loss = 1.00189616 Iteration 325, loss = 1.00268456 Iteration 326, loss = 1.00200870 Iteration 327, loss = 1.00083260 Iteration 328, loss = 1.00193683 Iteration 329, loss = 1.00178702 Iteration 330, loss = 1.00173154 Iteration 331, loss = 1.00564032 Iteration 332, loss = 1.00240442 Iteration 333, loss = 1.00037550 Iteration 334, loss = 0.99872209 Iteration 335, loss = 0.99817023 Iteration 336, loss = 0.99717233 Iteration 337, loss = 0.99930568 Iteration 338, loss = 0.99668355 Iteration 339, loss = 0.99922804 Iteration 340, loss = 0.99811519 Iteration 341, loss = 0.99931090 Iteration 342, loss = 0.99633424 Iteration 343, loss = 0.99835554 Iteration 344, loss = 1.00069208 Iteration 345, loss = 0.99718307 Iteration 346, loss = 0.99710537 Iteration 347, loss = 0.99629017 Iteration 348, loss = 0.99870676 Iteration 349, loss = 0.99698958 Iteration 350, loss = 0.99949066 Iteration 351, loss = 0.99586602 Iteration 352, loss = 0.99464350 Iteration 353, loss = 0.99682069 Iteration 354, loss = 0.99713876 Iteration 355, loss = 0.99594814 Iteration 356, loss = 0.99688282 Iteration 357, loss = 0.99680166 Iteration 358, loss = 0.99476145 Iteration 359, loss = 0.99425359 Iteration 360, loss = 0.99514110 Iteration 361, loss = 0.99564247 Iteration 362, loss = 0.99521346 Iteration 363, loss = 0.99454175 Iteration 364, loss = 0.99591524 Iteration 365, loss = 0.99180753 Iteration 366, loss = 0.99549016 
Iteration 367, loss = 0.99452901 Iteration 368, loss = 0.99321095 Iteration 369, loss = 0.99465936 Iteration 370, loss = 0.99332963 Iteration 371, loss = 0.99203479 Iteration 372, loss = 0.99298977 Iteration 373, loss = 0.99343459 Iteration 374, loss = 0.99350083 Iteration 375, loss = 0.99250918 Iteration 376, loss = 0.99292451 Iteration 377, loss = 0.99425056 Iteration 378, loss = 0.99226672 Iteration 379, loss = 0.99160208 Iteration 380, loss = 0.99238268 Iteration 381, loss = 0.99237390 Iteration 382, loss = 0.99198590 Iteration 383, loss = 0.99131080 Iteration 384, loss = 0.99231095 Iteration 385, loss = 0.99035291 Iteration 386, loss = 0.99435888 Iteration 387, loss = 0.99368064 Iteration 388, loss = 0.99085815 Iteration 389, loss = 0.99128658 Iteration 390, loss = 0.98981218 Iteration 391, loss = 0.99363474 Iteration 392, loss = 0.98982299 Iteration 393, loss = 0.98931272 Iteration 394, loss = 0.99130481 Iteration 395, loss = 0.99152533 Iteration 396, loss = 0.98992946 Iteration 397, loss = 0.99084666 Iteration 398, loss = 0.98957042 Iteration 399, loss = 0.99238828 Iteration 400, loss = 0.98950466 Iteration 401, loss = 0.99284506 Iteration 402, loss = 0.99044378 Iteration 403, loss = 0.98902059 Iteration 404, loss = 0.98747316 Iteration 405, loss = 0.98862271 Iteration 406, loss = 0.98864331 Iteration 407, loss = 0.99007426 Iteration 408, loss = 0.98854933 Iteration 409, loss = 0.98813690 Iteration 410, loss = 0.98726740 Iteration 411, loss = 0.98705733 Iteration 412, loss = 0.98910878 Iteration 413, loss = 0.99068021 Iteration 414, loss = 0.98551926 Iteration 415, loss = 0.98743024 Iteration 416, loss = 0.99008202 Iteration 417, loss = 0.98562958 Iteration 418, loss = 0.99000531 Iteration 419, loss = 0.98564586 Iteration 420, loss = 0.98612097 Iteration 421, loss = 0.98576646 Iteration 422, loss = 0.98705841 Iteration 423, loss = 0.98635679 Iteration 424, loss = 0.98651843 Iteration 425, loss = 0.98555745 Iteration 426, loss = 0.98532474 Iteration 427, loss 
= 0.98608918 Iteration 428, loss = 0.98532699 Iteration 429, loss = 0.98696407 Iteration 430, loss = 0.98454131 Iteration 431, loss = 0.98533472 Iteration 432, loss = 0.98487191 Iteration 433, loss = 0.98504423 Iteration 434, loss = 0.98505397 Iteration 435, loss = 0.98584425 Iteration 436, loss = 0.98575716 Iteration 437, loss = 0.98530870 Iteration 438, loss = 0.98387779 Iteration 439, loss = 0.98507633 Iteration 440, loss = 0.98367948 Iteration 441, loss = 0.98884451 Iteration 442, loss = 0.98335899 Iteration 443, loss = 0.98416229 Iteration 444, loss = 0.98212860 Iteration 445, loss = 0.98464322 Iteration 446, loss = 0.98454897 Iteration 447, loss = 0.98657045 Iteration 448, loss = 0.98313984 Iteration 449, loss = 0.98232206 Iteration 450, loss = 0.98187532 Iteration 451, loss = 0.98931334 Iteration 452, loss = 0.98472889 Iteration 453, loss = 0.98194329 Iteration 454, loss = 0.98277759 Iteration 455, loss = 0.98148152 Iteration 456, loss = 0.98205693 Iteration 457, loss = 0.98547997 Iteration 458, loss = 0.98068267 Iteration 459, loss = 0.97953766 Iteration 460, loss = 0.98219997 Iteration 461, loss = 0.98178057 Iteration 462, loss = 0.98241998 Iteration 463, loss = 0.98002538 Iteration 464, loss = 0.98066519 Iteration 465, loss = 0.98198339 Iteration 466, loss = 0.98385357 Iteration 467, loss = 0.98188733 Iteration 468, loss = 0.97955240 Iteration 469, loss = 0.98084324 Iteration 470, loss = 0.98114757 Iteration 471, loss = 0.98100944 Iteration 472, loss = 0.98046220 Iteration 473, loss = 0.98422974 Iteration 474, loss = 0.98271314 Iteration 475, loss = 0.97986761 Iteration 476, loss = 0.97912511 Iteration 477, loss = 0.98051722 Iteration 478, loss = 0.98089892 Iteration 479, loss = 0.98083700 Iteration 480, loss = 0.97855722 Iteration 481, loss = 0.97936555 Iteration 482, loss = 0.98207182 Iteration 483, loss = 0.97861374 Iteration 484, loss = 0.97780702 Iteration 485, loss = 0.98061196 Iteration 486, loss = 0.98031000 Iteration 487, loss = 0.98011303 
Iteration 488, loss = 0.97930027 Iteration 489, loss = 0.98061772 Iteration 490, loss = 0.97811784 Iteration 491, loss = 0.97886284 Iteration 492, loss = 0.97761745 Iteration 493, loss = 0.97666781 Iteration 494, loss = 0.97809086 Iteration 495, loss = 0.97693711 Iteration 496, loss = 0.98232263 Iteration 497, loss = 0.97915935 Iteration 498, loss = 0.98038072 Iteration 499, loss = 0.97758434 Iteration 500, loss = 0.97745351 Iteration 501, loss = 0.97570297 Iteration 502, loss = 0.97931092 Iteration 503, loss = 0.97829706 Iteration 504, loss = 0.97624235 Iteration 505, loss = 0.97700138 Iteration 506, loss = 0.97517153 Iteration 507, loss = 0.97732112 Iteration 508, loss = 0.97884351 Iteration 509, loss = 0.97843165 Iteration 510, loss = 0.97635112 Iteration 511, loss = 0.97532810 Iteration 512, loss = 0.97724079 Iteration 513, loss = 0.97580934 Iteration 514, loss = 0.97409689 Iteration 515, loss = 0.97674897 Iteration 516, loss = 0.97580840 Iteration 517, loss = 0.97531113 Iteration 518, loss = 0.97500212 Iteration 519, loss = 0.97815247 Iteration 520, loss = 0.97320555 Iteration 521, loss = 0.97675637 Iteration 522, loss = 0.97714358 Iteration 523, loss = 0.97574527 Iteration 524, loss = 0.97560734 Iteration 525, loss = 0.97541665 Iteration 526, loss = 0.97449760 Iteration 527, loss = 0.97637487 Iteration 528, loss = 0.97492094 Iteration 529, loss = 0.97482207 Iteration 530, loss = 0.97452260 Iteration 531, loss = 0.99339122 Iteration 532, loss = 0.97852285 Iteration 533, loss = 0.97602815 Iteration 534, loss = 0.97308232 Iteration 535, loss = 0.97683613 Iteration 536, loss = 0.97130142 Iteration 537, loss = 0.97267021 Iteration 538, loss = 0.97205517 Iteration 539, loss = 0.97416102 Iteration 540, loss = 0.97423532 Iteration 541, loss = 0.97322640 Iteration 542, loss = 0.97463237 Iteration 543, loss = 0.97564325 Iteration 544, loss = 0.97362061 Iteration 545, loss = 0.97230006 Iteration 546, loss = 0.97370073 Iteration 547, loss = 0.98103609 Iteration 548, loss 
= 0.97402316 Iteration 549, loss = 0.97435897 Iteration 550, loss = 0.97307889 Iteration 551, loss = 0.97462841 Iteration 552, loss = 0.97274508 Iteration 553, loss = 0.97186930 Iteration 554, loss = 0.97407266 Iteration 555, loss = 0.97294868 Iteration 556, loss = 0.97337304 Iteration 557, loss = 0.97276428 Iteration 558, loss = 0.98550940 Iteration 559, loss = 0.97913462 Iteration 560, loss = 0.97257470 Iteration 561, loss = 0.97085040 Iteration 562, loss = 0.97098808 Iteration 563, loss = 0.97394868 Iteration 564, loss = 0.97275056 Iteration 565, loss = 0.97116302 Iteration 566, loss = 0.97049549 Iteration 567, loss = 0.97118758 Iteration 568, loss = 0.97439034 Iteration 569, loss = 0.97067366 Iteration 570, loss = 0.97160445 Iteration 571, loss = 0.97148313 Iteration 572, loss = 0.97071143 Iteration 573, loss = 0.97221332 Iteration 574, loss = 0.97117245 Iteration 575, loss = 0.97011267 Iteration 576, loss = 0.97223107 Iteration 577, loss = 0.97211199 Iteration 578, loss = 0.97043939 Iteration 579, loss = 0.97318721 Iteration 580, loss = 0.97030344 Iteration 581, loss = 0.96877787 Iteration 582, loss = 0.97008379 Iteration 583, loss = 0.97243644 Iteration 584, loss = 0.97359353 Iteration 585, loss = 0.97478189 Iteration 586, loss = 0.96726432 Iteration 587, loss = 0.96682858 Iteration 588, loss = 0.97061515 Iteration 589, loss = 0.97081509 Iteration 590, loss = 0.97017221 Iteration 591, loss = 0.96988694 Iteration 592, loss = 0.96767235 Iteration 593, loss = 0.96799037 Iteration 594, loss = 0.96983270 Iteration 595, loss = 0.96792479 Iteration 596, loss = 0.98373754 Iteration 597, loss = 0.97083876 Iteration 598, loss = 0.96608976 Iteration 599, loss = 0.96927340 Iteration 600, loss = 0.96776851 Iteration 601, loss = 0.96806292 Iteration 602, loss = 0.96898341 Iteration 603, loss = 0.96807147 Iteration 604, loss = 0.96903525 Iteration 605, loss = 0.96715067 Iteration 606, loss = 0.96936257 Iteration 607, loss = 0.96604903 Iteration 608, loss = 0.96792325 
Iteration 609, loss = 0.96698188 Iteration 610, loss = 0.97183809 Iteration 611, loss = 0.96877588 Iteration 612, loss = 0.96854961 Iteration 613, loss = 0.96716663 Iteration 614, loss = 0.96497956 Iteration 615, loss = 0.97260538 Iteration 616, loss = 0.96431293 Iteration 617, loss = 0.96901245 Iteration 618, loss = 0.96845091 Iteration 619, loss = 0.96609581 Iteration 620, loss = 0.96680235 Iteration 621, loss = 0.96632626 Iteration 622, loss = 0.96656775 Iteration 623, loss = 0.96741559 Iteration 624, loss = 0.96812009 Iteration 625, loss = 0.96544124 Iteration 626, loss = 0.96986298 Iteration 627, loss = 0.96383274 Iteration 628, loss = 0.96681138 Iteration 629, loss = 0.96799711 Iteration 630, loss = 0.96711364 Iteration 631, loss = 0.96576130 Iteration 632, loss = 0.96603035 Iteration 633, loss = 0.96426974 Iteration 634, loss = 0.96616267 Iteration 635, loss = 0.97200516 Iteration 636, loss = 0.96931439 Iteration 637, loss = 0.96380767 Iteration 638, loss = 0.96275017 Iteration 639, loss = 0.96468921 Iteration 640, loss = 0.96522451 Iteration 641, loss = 0.97078981 Iteration 642, loss = 0.96654920 Iteration 643, loss = 0.96457088 Iteration 644, loss = 0.96435877 Iteration 645, loss = 0.96673729 Iteration 646, loss = 0.96728273 Iteration 647, loss = 0.97030719 Iteration 648, loss = 0.96423510 Iteration 649, loss = 0.96192855 Iteration 650, loss = 0.96395923 Iteration 651, loss = 0.96488427 Iteration 652, loss = 0.96992149 Iteration 653, loss = 0.96534997 Iteration 654, loss = 0.96406104 Iteration 655, loss = 0.96643988 Iteration 656, loss = 0.96371438 Iteration 657, loss = 0.96571433 Iteration 658, loss = 0.96739554 Iteration 659, loss = 0.96327486 Iteration 660, loss = 0.96256369 Iteration 661, loss = 0.96665455 Iteration 662, loss = 0.96217797 Iteration 663, loss = 0.96300894 Iteration 664, loss = 0.96923877 Iteration 665, loss = 0.96514320 Iteration 666, loss = 0.96616411 Iteration 667, loss = 0.96495448 Iteration 668, loss = 0.96436207 Iteration 669, loss 
= 0.96229918 Iteration 670, loss = 0.96225512 Iteration 671, loss = 0.96514793 Iteration 672, loss = 0.96256004 Iteration 673, loss = 0.96408105 Iteration 674, loss = 0.96479374 Iteration 675, loss = 0.96331645 Iteration 676, loss = 0.96120422 Iteration 677, loss = 0.96121228 Iteration 678, loss = 0.96168957 Iteration 679, loss = 0.96491569 Iteration 680, loss = 0.96250787 Iteration 681, loss = 0.96179837 Iteration 682, loss = 0.96301576 Iteration 683, loss = 0.96228321 Iteration 684, loss = 0.96247492 Iteration 685, loss = 0.96300686 Iteration 686, loss = 0.96103285 Iteration 687, loss = 0.96197987 Iteration 688, loss = 0.96230110 Iteration 689, loss = 0.96038260 Iteration 690, loss = 0.96125040 Iteration 691, loss = 0.96051707 Iteration 692, loss = 0.96178429 Iteration 693, loss = 0.96210895 Iteration 694, loss = 0.96307371 Iteration 695, loss = 0.96134160 Iteration 696, loss = 0.96561070 Iteration 697, loss = 0.96169933 Iteration 698, loss = 0.95993460 Iteration 699, loss = 0.96537400 Iteration 700, loss = 0.96181136 Iteration 701, loss = 0.96057084 Iteration 702, loss = 0.95981215 Iteration 703, loss = 0.95988821 Iteration 704, loss = 0.96049978 Iteration 705, loss = 0.96024194 Iteration 706, loss = 0.95868047 Iteration 707, loss = 0.96407473 Iteration 708, loss = 0.96446380 Iteration 709, loss = 0.96637441 Iteration 710, loss = 0.96062646 Iteration 711, loss = 0.96165940 Iteration 712, loss = 0.95825928 Iteration 713, loss = 0.96007933 Iteration 714, loss = 0.95845648 Iteration 715, loss = 0.96100023 Iteration 716, loss = 0.96175926 Iteration 717, loss = 0.96089884 Iteration 718, loss = 0.96324838 Iteration 719, loss = 0.95827047 Iteration 720, loss = 0.95734903 Iteration 721, loss = 0.95964490 Iteration 722, loss = 0.96304779 Iteration 723, loss = 0.96033898 Iteration 724, loss = 0.95805705 Iteration 725, loss = 0.96369232 Iteration 726, loss = 0.96362911 Iteration 727, loss = 0.95872622 Iteration 728, loss = 0.96212201 Iteration 729, loss = 0.96047286 
Iteration 730, loss = 0.95981492 Iteration 731, loss = 0.96088052 Iteration 732, loss = 0.95978449 Iteration 733, loss = 0.95936618 Iteration 734, loss = 0.95891941 Iteration 735, loss = 0.96163373 Iteration 736, loss = 0.96395609 Iteration 737, loss = 0.95746241 Iteration 738, loss = 0.95611409 Iteration 739, loss = 0.95731304 Iteration 740, loss = 0.95878806 Iteration 741, loss = 0.96583497 Iteration 742, loss = 0.96090425 Iteration 743, loss = 0.96025153 Iteration 744, loss = 0.95786941 Iteration 745, loss = 0.95621671 Iteration 746, loss = 0.95935992 Iteration 747, loss = 0.95691073 Iteration 748, loss = 0.95757957 Iteration 749, loss = 0.96271971 Iteration 750, loss = 0.96065024 Iteration 751, loss = 0.95765979 Iteration 752, loss = 0.95824396 Iteration 753, loss = 0.95941396 Iteration 754, loss = 0.96016996 Iteration 755, loss = 0.96405551 Iteration 756, loss = 0.96208937 Iteration 757, loss = 0.95992965 Iteration 758, loss = 0.95570506 Iteration 759, loss = 0.95622737 Iteration 760, loss = 0.95810573 Iteration 761, loss = 0.95757574 Iteration 762, loss = 0.95564320 Iteration 763, loss = 0.95917253 Iteration 764, loss = 0.95796832 Iteration 765, loss = 0.96836469 Iteration 766, loss = 0.95883179 Iteration 767, loss = 0.95967417 Iteration 768, loss = 0.95909964 Iteration 769, loss = 0.95848308 Iteration 770, loss = 0.95825666 Iteration 771, loss = 0.95881348 Iteration 772, loss = 0.96071532 Iteration 773, loss = 0.95869658 Iteration 774, loss = 0.95985350 Iteration 775, loss = 0.95895232 Iteration 776, loss = 0.95674190 Iteration 777, loss = 0.95811786 Iteration 778, loss = 0.95827756 Iteration 779, loss = 0.96069012 Iteration 780, loss = 0.95686907 Iteration 781, loss = 0.95413799 Iteration 782, loss = 0.95571093 Iteration 783, loss = 0.95708236 Iteration 784, loss = 0.95669916 Iteration 785, loss = 0.95738822 Iteration 786, loss = 0.95678220 Iteration 787, loss = 0.95576547 Iteration 788, loss = 0.95763468 Iteration 789, loss = 0.95903518 Iteration 790, loss 
= 0.95603462 Iteration 791, loss = 0.95868709 Iteration 792, loss = 0.95344084 Iteration 793, loss = 0.95739098 Iteration 794, loss = 0.95496932 Iteration 795, loss = 0.95551714 Iteration 796, loss = 0.95856204 Iteration 797, loss = 0.95517915 Iteration 798, loss = 0.95629457 Iteration 799, loss = 0.95561760 Iteration 800, loss = 0.95559318 Iteration 801, loss = 0.95473840 Iteration 802, loss = 0.95788190 Iteration 803, loss = 0.95736380 Iteration 804, loss = 0.95770359 Iteration 805, loss = 0.95696434 Iteration 806, loss = 0.95324998 Iteration 807, loss = 0.95736242 Iteration 808, loss = 0.95836191 Iteration 809, loss = 0.95327325 Iteration 810, loss = 0.95369478 Iteration 811, loss = 0.95378183 Iteration 812, loss = 0.95433968 Iteration 813, loss = 0.95632250 Iteration 814, loss = 0.95498170 Iteration 815, loss = 0.95478973 Iteration 816, loss = 0.95403654 Iteration 817, loss = 0.95308105 Iteration 818, loss = 0.95171833 Iteration 819, loss = 0.95766241 Iteration 820, loss = 0.95528185 Iteration 821, loss = 0.95391841 Iteration 822, loss = 0.95508203 Iteration 823, loss = 0.95504502 Iteration 824, loss = 0.95552004 Iteration 825, loss = 0.95967881 Iteration 826, loss = 0.95417388 Iteration 827, loss = 0.95074056 Iteration 828, loss = 0.95035048 Iteration 829, loss = 0.95618663 Iteration 830, loss = 0.95951634 Iteration 831, loss = 0.95119299 Iteration 832, loss = 0.95132272 Iteration 833, loss = 0.95444678 Iteration 834, loss = 0.95790596 Iteration 835, loss = 0.95447477 Iteration 836, loss = 0.95402139 Iteration 837, loss = 0.95220093 Iteration 838, loss = 0.95212948 Iteration 839, loss = 0.95779971 Iteration 840, loss = 0.95533843 Iteration 841, loss = 0.95378069 Iteration 842, loss = 0.94971552 Iteration 843, loss = 0.95107341 Iteration 844, loss = 0.95390567 Iteration 845, loss = 0.95760361 Iteration 846, loss = 0.95526078 Iteration 847, loss = 0.95445978 Iteration 848, loss = 0.95235797 Iteration 849, loss = 0.95256678 Iteration 850, loss = 0.95191873 
Iteration 851, loss = 0.95281916 Iteration 852, loss = 0.95287419 Iteration 853, loss = 0.95266077 Iteration 854, loss = 0.95297615 Iteration 855, loss = 0.95409058 Iteration 856, loss = 0.95481132 Iteration 857, loss = 0.95543772 Iteration 858, loss = 0.94992507 Iteration 859, loss = 0.95386613 Iteration 860, loss = 0.95155044 Iteration 861, loss = 0.95048306 Iteration 862, loss = 0.95400506 Iteration 863, loss = 0.95685831 Iteration 864, loss = 0.95313773 Iteration 865, loss = 0.95055493 Iteration 866, loss = 0.94908949 Iteration 867, loss = 0.95586962 Iteration 868, loss = 0.95372943 Iteration 869, loss = 0.95247285 Iteration 870, loss = 0.95788666 Iteration 871, loss = 0.95294041 Iteration 872, loss = 0.95978346 Iteration 873, loss = 0.95312440 Iteration 874, loss = 0.95333451 Iteration 875, loss = 0.94959512 Iteration 876, loss = 0.94982775 Iteration 877, loss = 0.94965943 Iteration 878, loss = 0.95212885 Iteration 879, loss = 0.95413283 Iteration 880, loss = 0.95397950 Iteration 881, loss = 0.94976527 Iteration 882, loss = 0.95251817 Iteration 883, loss = 0.95011802 Iteration 884, loss = 0.94895472 Iteration 885, loss = 0.95326543 Iteration 886, loss = 0.94893769 Iteration 887, loss = 0.95603864 Iteration 888, loss = 0.95383445 Iteration 889, loss = 0.94975378 Iteration 890, loss = 0.94993035 Iteration 891, loss = 0.95195762 Iteration 892, loss = 0.95844972 Iteration 893, loss = 0.94944678 Iteration 894, loss = 0.95064389 Iteration 895, loss = 0.94902884 Iteration 896, loss = 0.95002580 Iteration 897, loss = 0.95953787 Iteration 898, loss = 0.95215912 Iteration 899, loss = 0.94778510 Iteration 900, loss = 0.94985311 Iteration 901, loss = 0.95141796 Iteration 902, loss = 0.94762155 Iteration 903, loss = 0.94948003 Iteration 904, loss = 0.95218773 Iteration 905, loss = 0.95406503 Iteration 906, loss = 0.94919802 Iteration 907, loss = 0.95183999 Iteration 908, loss = 0.94832895 Iteration 909, loss = 0.95477791 Iteration 910, loss = 0.95196658 Iteration 911, loss 
= 0.94605453 Iteration 912, loss = 0.94988448 Iteration 913, loss = 0.95431598 Iteration 914, loss = 0.95145045 Iteration 915, loss = 0.94843523 Iteration 916, loss = 0.94849248 Iteration 917, loss = 0.94728257 Iteration 918, loss = 0.95229570 Iteration 919, loss = 0.95226867 Iteration 920, loss = 0.95043476 Iteration 921, loss = 0.94798330 Iteration 922, loss = 0.95221865 Iteration 923, loss = 0.94770476 Iteration 924, loss = 0.94810415 Iteration 925, loss = 0.95427917 Iteration 926, loss = 0.94938989 Iteration 927, loss = 0.95012598 Iteration 928, loss = 0.95096376 Iteration 929, loss = 0.94933504 Iteration 930, loss = 0.94819340 Iteration 931, loss = 0.94786895 Iteration 932, loss = 0.94610640 Iteration 933, loss = 0.95118856 Iteration 934, loss = 0.95190303 Iteration 935, loss = 0.94902702 Iteration 936, loss = 0.94747749 Iteration 937, loss = 0.94872279 Iteration 938, loss = 0.94658823 Iteration 939, loss = 0.95022221 Iteration 940, loss = 0.94623700 Iteration 941, loss = 0.94620242 Iteration 942, loss = 0.94773136 Iteration 943, loss = 0.96376294 Iteration 944, loss = 0.95141911 Iteration 945, loss = 0.94543027 Iteration 946, loss = 0.95182442 Iteration 947, loss = 0.94513832 Iteration 948, loss = 0.94519869 Iteration 949, loss = 0.94819082 Iteration 950, loss = 0.94982362 Iteration 951, loss = 0.95075012 Iteration 952, loss = 0.94537101 Iteration 953, loss = 0.94786442 Iteration 954, loss = 0.94994196 Iteration 955, loss = 0.94928750 Iteration 956, loss = 0.94495266 Iteration 957, loss = 0.94825147 Iteration 958, loss = 0.94765257 Iteration 959, loss = 0.94827578 Iteration 960, loss = 0.95020176 Iteration 961, loss = 0.95526433 Iteration 962, loss = 0.94468167 Iteration 963, loss = 0.94692758 Iteration 964, loss = 0.94554306 Iteration 965, loss = 0.95135177 Iteration 966, loss = 0.94947833 Iteration 967, loss = 0.94728780 Iteration 968, loss = 0.94726523 Iteration 969, loss = 0.94826572 Iteration 970, loss = 0.94815294 Iteration 971, loss = 0.94585282 
Iteration 972, loss = 0.94611007 Iteration 973, loss = 0.94919643 Iteration 974, loss = 0.94708304 Iteration 975, loss = 0.95878057 Iteration 976, loss = 0.94514645 Iteration 977, loss = 0.94539509 Iteration 978, loss = 0.94450988 Iteration 979, loss = 0.95014344 Iteration 980, loss = 0.95023596 Iteration 981, loss = 0.94513496 Iteration 982, loss = 0.94756331 Iteration 983, loss = 0.94350910 Iteration 984, loss = 0.95201986 Iteration 985, loss = 0.95114107 Iteration 986, loss = 0.94510287 Iteration 987, loss = 0.94747280 Iteration 988, loss = 0.94425552 Iteration 989, loss = 0.94522645 Iteration 990, loss = 0.94763969 Iteration 991, loss = 0.94514439 Iteration 992, loss = 0.94815275 Iteration 993, loss = 0.95311609 Iteration 994, loss = 0.94729790 Iteration 995, loss = 0.94438312 Iteration 996, loss = 0.94630248 Iteration 997, loss = 0.94849523 Iteration 998, loss = 0.94650992 Iteration 999, loss = 0.94496459 Iteration 1000, loss = 0.95560568 precision recall f1-score support 0.0 0.40 0.35 0.37 279 1.0 0.31 0.32 0.31 253 2.0 0.38 0.41 0.39 298 accuracy 0.36 830 macro avg 0.36 0.36 0.36 830 weighted avg 0.36 0.36 0.36 830
C:\Users\Bruger\AppData\Roaming\Python\Python310\site-packages\sklearn\neural_network\_multilayer_perceptron.py:692: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet. warnings.warn(
# Hold out 1% of the balanced rows for evaluation, train a small MLP on the
# rest, and print precision/recall/F1 for the held-out set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_balanced, balanced_labels, train_size = 0.99)

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# NOTE(review): the variable is called `rfc` but it holds an MLPClassifier —
# a RandomForestClassifier was tried earlier (see the commented line below).
# The name is kept unchanged in case later cells reference it.
rfc = MLPClassifier(
    verbose=True,
    hidden_layer_sizes=(100,),
    learning_rate='invscaling',        # decaying step size
    learning_rate_init=0.0005,
    n_iter_no_change=50,               # early-stop patience
    max_iter=1000,
)
# rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

from sklearn.metrics import classification_report

print(classification_report(y_test, rfc.predict(X_test)))
Iteration 1, loss = 2.24735406 Iteration 2, loss = 1.21552970 Iteration 3, loss = 1.19370462 Iteration 4, loss = 1.21115846 Iteration 5, loss = 1.17688781 Iteration 6, loss = 1.18027952 Iteration 7, loss = 1.23369810 Iteration 8, loss = 1.18573559 Iteration 9, loss = 1.19624951 Iteration 10, loss = 1.21515312 Iteration 11, loss = 1.16470400 Iteration 12, loss = 1.18113593 Iteration 13, loss = 1.20423397 Iteration 14, loss = 1.17088834 Iteration 15, loss = 1.18582957 Iteration 16, loss = 1.16356575 Iteration 17, loss = 1.20478582 Iteration 18, loss = 1.18247954 Iteration 19, loss = 1.20287715 Iteration 20, loss = 1.19825816 Iteration 21, loss = 1.17343568 Iteration 22, loss = 1.21634638 Iteration 23, loss = 1.16178581 Iteration 24, loss = 1.18911451 Iteration 25, loss = 1.21030443 Iteration 26, loss = 1.18205776 Iteration 27, loss = 1.18256996 Iteration 28, loss = 1.18050753 Iteration 29, loss = 1.19175628 Iteration 30, loss = 1.17336852 Iteration 31, loss = 1.16042532 Iteration 32, loss = 1.21768535 Iteration 33, loss = 1.18327870 Iteration 34, loss = 1.18122359 Iteration 35, loss = 1.17272692 Iteration 36, loss = 1.19471285 Iteration 37, loss = 1.18404735 Iteration 38, loss = 1.16932872 Iteration 39, loss = 1.19766431 Iteration 40, loss = 1.15939963 Iteration 41, loss = 1.20109253 Iteration 42, loss = 1.19275179 Iteration 43, loss = 1.15824389 Iteration 44, loss = 1.16985967 Iteration 45, loss = 1.17186164 Iteration 46, loss = 1.21131221 Iteration 47, loss = 1.20282030 Iteration 48, loss = 1.17930497 Iteration 49, loss = 1.18649115 Iteration 50, loss = 1.16293083 Iteration 51, loss = 1.15588926 Iteration 52, loss = 1.18370429 Iteration 53, loss = 1.18158868 Iteration 54, loss = 1.17257321 Iteration 55, loss = 1.17130212 Iteration 56, loss = 1.17730304 Iteration 57, loss = 1.16594857 Iteration 58, loss = 1.16755209 Iteration 59, loss = 1.18639177 Iteration 60, loss = 1.17229275 Iteration 61, loss = 1.18851070 Iteration 62, loss = 1.20323167 Iteration 63, loss = 
1.16724971 Iteration 64, loss = 1.18784351 Iteration 65, loss = 1.18602152 Iteration 66, loss = 1.16949703 Iteration 67, loss = 1.14618922 Iteration 68, loss = 1.17081160 Iteration 69, loss = 1.18872349 Iteration 70, loss = 1.17506416 Iteration 71, loss = 1.18032379 Iteration 72, loss = 1.17302062 Iteration 73, loss = 1.17010463 Iteration 74, loss = 1.15879612 Iteration 75, loss = 1.17953721 Iteration 76, loss = 1.17128939 Iteration 77, loss = 1.17689344 Iteration 78, loss = 1.15931033 Iteration 79, loss = 1.17813790 Iteration 80, loss = 1.17955940 Iteration 81, loss = 1.17565941 Iteration 82, loss = 1.17165024 Iteration 83, loss = 1.17217349 Iteration 84, loss = 1.17035018 Iteration 85, loss = 1.18987476 Iteration 86, loss = 1.15939745 Iteration 87, loss = 1.17402584 Iteration 88, loss = 1.16124755 Iteration 89, loss = 1.15575825 Iteration 90, loss = 1.16658888 Iteration 91, loss = 1.15050259 Iteration 92, loss = 1.16886458 Iteration 93, loss = 1.17024630 Iteration 94, loss = 1.16159329 Iteration 95, loss = 1.17196056 Iteration 96, loss = 1.17526487 Iteration 97, loss = 1.18245704 Iteration 98, loss = 1.18631733 Iteration 99, loss = 1.19449147 Iteration 100, loss = 1.15374104 Iteration 101, loss = 1.15524783 Iteration 102, loss = 1.15317664 Iteration 103, loss = 1.17060170 Iteration 104, loss = 1.19585260 Iteration 105, loss = 1.17558026 Iteration 106, loss = 1.18021167 Iteration 107, loss = 1.17267092 Iteration 108, loss = 1.19215996 Iteration 109, loss = 1.18197683 Iteration 110, loss = 1.15782534 Iteration 111, loss = 1.17584200 Iteration 112, loss = 1.15206897 Iteration 113, loss = 1.16216539 Iteration 114, loss = 1.15278872 Iteration 115, loss = 1.16518025 Iteration 116, loss = 1.16373192 Iteration 117, loss = 1.14691320 Iteration 118, loss = 1.16267827 Training loss did not improve more than tol=0.000100 for 50 consecutive epochs. Stopping. 
precision recall f1-score support 0.0 0.35 0.39 0.36 279 1.0 0.33 0.45 0.38 257 2.0 0.35 0.20 0.25 294 accuracy 0.34 830 macro avg 0.34 0.34 0.33 830 weighted avg 0.34 0.34 0.33 830
# Reduce the balanced feature matrix to its first three principal components
# so the data can be rendered as a 3-D scatter plot below.
pca_3d = PCA(n_components=3)
pca_3d.fit(df_balanced)
clustered_dataset_3d = pca_3d.transform(df_balanced)
def plot3dwithspike(width, height, title, datapoints, myLabel=None):
    """Draw a 3-D scatter plot of *datapoints* (expects an (n, 3) array).

    width, height -- figure size in inches
    title         -- figure title
    myLabel       -- optional per-point values used to colour the markers
    """
    plt.figure(figsize=(width, height))
    plt.title(title, fontsize='medium')
    axes3d = plt.axes(projection='3d')
    axes3d.scatter3D(
        datapoints[:, 0],
        datapoints[:, 1],
        datapoints[:, 2],
        c=myLabel,
        marker='o',
        s=15,
        edgecolor='k',
    )
    plt.show()

plot3dwithspike(20, 20, "PCA3 donation data", clustered_dataset_3d, balanced_labels)
# Fit an unsupervised Local Outlier Factor model on the training portion and
# score the held-out rows: +1 = inlier, -1 = outlier.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor

X_train, X_test, y_train, y_test = train_test_split(df_balanced, balanced_labels, train_size = 0.99)

# novelty=True is required for predict() on data not seen during fit().
clf = LocalOutlierFactor(novelty=True)
clf.fit(X_train)
clf.predict(X_test)
C:\Users\Bruger\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn(
array([ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])