
Commit: string enum

robertfrankzhang committed Jun 16, 2020
1 parent 4b79eda commit 9540603
Showing 2 changed files with 33 additions and 57 deletions.
setup.py (4 changes: 2 additions, 2 deletions)
@@ -6,14 +6,14 @@
setup(
name = 'scikit-eLCS',
packages = ['skeLCS'],
- version = '1.2.1',
+ version = '1.2.2',
license='License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
description = 'Educational Learning Classifier System',
long_description_content_type="text/markdown",
author = 'Robert Zhang, Ryan J. Urbanowicz',
author_email = 'robertzh@seas.upenn.edu,ryanurb@upenn.edu',
url = 'https://github.com/UrbsLab/scikit-eLCS',
- download_url = 'https://github.com/UrbsLab/scikit-eLCS/archive/v_1.2.1.tar.gz',
+ download_url = 'https://github.com/UrbsLab/scikit-eLCS/archive/v_1.2.2.tar.gz',
keywords = ['machine learning','data analysis','data science','learning classifier systems'],
install_requires=['numpy','pandas','scikit-learn'],
classifiers=[
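
The two edited lines above must move together: the version string and the versioned download_url both encode the release number. A minimal sketch of one way to derive both from a single constant (illustrative only, not how the repository's setup.py is actually written):

from setuptools import setup

VERSION = "1.2.2"  # single source of truth for the release number

setup(
    name="scikit-eLCS",
    packages=["skeLCS"],
    version=VERSION,
    # The archive URL follows the v_<version> naming seen in the download_url above.
    download_url=f"https://github.com/UrbsLab/scikit-eLCS/archive/v_{VERSION}.tar.gz",
)
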
skeLCS/DataCleanup.py (86 changes: 31 additions, 55 deletions)
@@ -1,9 +1,4 @@
- '''
- Name: eLCS.py
- Authors: Robert Zhang in association with Ryan Urbanowicz
- Contact: robertzh@wharton.upenn.edu
- Description: This module creates a class that takes in data, and cleans it up to be used by another machine learning module
- '''


import numpy as np
import pandas as pd
@@ -106,11 +101,11 @@ def add_attribute_converter_map(self,headerName,map):
def add_attribute_converter_random(self,headerName):
if headerName in self.dataHeaders and not (headerName in self.map):
headerIndex = np.where(self.dataHeaders == headerName)[0][0]
- uniqueItems = np.array([])
+ uniqueItems = []
for instance in self.dataFeatures:
if not(instance[headerIndex] in uniqueItems) and instance[headerIndex] != "NA":
- uniqueItems = np.append(uniqueItems,instance[headerIndex])
- self.add_attribute_converter(headerName,uniqueItems)
+ uniqueItems.append(instance[headerIndex])
+ self.add_attribute_converter(headerName,np.array(uniqueItems))

def add_class_converter(self,array):
if not (self.classLabel in self.map.keys()):
@@ -121,11 +116,11 @@ def add_class_converter(self,array):

def add_class_converter_random(self):
if not (self.classLabel in self.map.keys()):
- uniqueItems = np.array([])
+ uniqueItems = []
for instance in self.dataPhenotypes:
if not (instance in uniqueItems) and instance != "NA":
- uniqueItems = np.append(uniqueItems, instance)
- self.add_class_converter(uniqueItems)
+ uniqueItems.append(instance)
+ self.add_class_converter(np.array(uniqueItems))

def convert_all_attributes(self):
for attribute in self.dataHeaders:
@@ -144,56 +139,43 @@ def convert_all_attributes(self):
def delete_attribute(self,headerName):
if headerName in self.dataHeaders:
i = np.where(headerName == self.dataHeaders)[0][0]
- newFeatures = np.array([[2,3]])
self.dataHeaders = np.delete(self.dataHeaders,i)
if headerName in self.map.keys():
del self.map[headerName]

+ newFeatures = []
for instanceIndex in range(len(self.dataFeatures)):
instance = np.delete(self.dataFeatures[instanceIndex],i)
- if (instanceIndex == 0):
- newFeatures = np.array([instance])
- else:
- newFeatures = np.concatenate((newFeatures,[instance]),axis=0)
- self.dataFeatures = newFeatures
+ newFeatures.append(instance)
+ self.dataFeatures = np.array(newFeatures)
else:
raise Exception("Header Doesn't Exist")

def delete_all_instances_without_header_data(self,headerName):
- newFeatures = np.array([[2,3]])
- newPhenotypes = np.array([])
+ newFeatures = []
+ newPhenotypes = []
attributeIndex = np.where(self.dataHeaders == headerName)[0][0]

- firstTime = True
for instanceIndex in range(len(self.dataFeatures)):
instance = self.dataFeatures[instanceIndex]
if instance[attributeIndex] != "NA":
- if firstTime:
- firstTime = False
- newFeatures = np.array([instance])
- else:
- newFeatures = np.concatenate((newFeatures,[instance]),axis = 0)
- newPhenotypes = np.append(newPhenotypes,self.dataPhenotypes[instanceIndex])
+ newFeatures.append(instance)
+ newPhenotypes.append(self.dataPhenotypes[instanceIndex])

- self.dataFeatures = newFeatures
- self.dataPhenotypes = newPhenotypes
+ self.dataFeatures = np.array(newFeatures)
+ self.dataPhenotypes = np.array(newPhenotypes)

def delete_all_instances_without_phenotype(self):
- newFeatures = np.array([[2,3]])
- newPhenotypes = np.array([])
- firstTime = True
+ newFeatures = []
+ newPhenotypes = []
for instanceIndex in range(len(self.dataFeatures)):
instance = self.dataPhenotypes[instanceIndex]
if instance != "NA":
- if firstTime:
- firstTime = False
- newFeatures = np.array([self.dataFeatures[instanceIndex]])
- else:
- newFeatures = np.concatenate((newFeatures,[self.dataFeatures[instanceIndex]]),axis = 0)
- newPhenotypes = np.append(newPhenotypes,instance)
+ newFeatures.append(self.dataFeatures[instanceIndex])
+ newPhenotypes.append(instance)

- self.dataFeatures = newFeatures
- self.dataPhenotypes = newPhenotypes
+ self.dataFeatures = np.array(newFeatures)
+ self.dataPhenotypes = np.array(newPhenotypes)

def print(self):
isFullNumber = self.check_is_full_numeric()
@@ -247,26 +229,20 @@ def get_params(self):
if not(self.check_is_full_numeric()):
raise Exception("Features and Phenotypes must be fully numeric")

- newFeatures = np.array([[2,3]],dtype=float)
- newPhenotypes = np.array([],dtype=float)
- firstTime = True
+ newFeatures = []
+ newPhenotypes = []
for instanceIndex in range(len(self.dataFeatures)):
- newInstance = np.array([],dtype=float)
+ newInstance = []
for attribute in self.dataFeatures[instanceIndex]:
if attribute == "NA":
- newInstance = np.append(newInstance, np.nan)
+ newInstance.append(np.nan)
else:
- newInstance = np.append(newInstance, float(attribute))

- if firstTime:
- firstTime = False
- newFeatures = np.array([newInstance])
- else:
- newFeatures = np.concatenate((newFeatures,[newInstance]),axis = 0)
+ newInstance.append(float(attribute))

+ newFeatures.append(np.array(newInstance,dtype=float))
if self.dataPhenotypes[instanceIndex] == "NA": #Should never happen. All NaN phenotypes should be removed automatically at init. Just a safety mechanism.
- newPhenotypes = np.append(newPhenotypes, np.nan)
+ newPhenotypes.append(np.nan)
else:
- newPhenotypes = np.append(newPhenotypes, float(self.dataPhenotypes[instanceIndex]))
+ newPhenotypes.append(float(self.dataPhenotypes[instanceIndex]))

- return self.dataHeaders,self.classLabel,newFeatures,newPhenotypes
+ return self.dataHeaders,self.classLabel,np.array(newFeatures,dtype=float),np.array(newPhenotypes,dtype=float)
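
Every hunk in this file applies the same refactor: instead of growing a NumPy array inside a loop with np.append or np.concatenate (each call copies the whole array), values are collected in a plain Python list and converted to an array once at the end. A minimal standalone sketch of the two patterns on a hypothetical column of string values (not code from the repository):

import numpy as np

raw_values = ["red", "blue", "red", "NA", "green"]  # hypothetical attribute column

# Old pattern: np.append copies the whole array on every call (quadratic overall).
kept_old = np.array([])
for value in raw_values:
    if value != "NA":
        kept_old = np.append(kept_old, value)

# New pattern, as adopted in this commit: accumulate in a list, convert once.
kept_new = []
for value in raw_values:
    if value != "NA":
        kept_new.append(value)
kept_new = np.array(kept_new)

The same list-then-np.array pattern is applied to newFeatures, newPhenotypes, and newInstance throughout DataCleanup.py, which also removes the firstTime/placeholder-array bookkeeping the old code needed to seed np.concatenate.

For context, a hedged usage sketch of the data-cleanup workflow these methods belong to. Only the method names appear in the diff above; the class name StringEnumerator, its constructor arguments, and the file and column names below are assumptions for illustration:

from skeLCS.DataCleanup import StringEnumerator  # assumed import path and class name

# Assumed constructor: a CSV file plus the name of the class (phenotype) column.
converter = StringEnumerator("my_dataset.csv", "Class")

# Assign numeric codes to string values in order of first appearance,
# then convert every attribute and the class column to those codes.
converter.add_attribute_converter_random("Color")  # hypothetical column name
converter.add_class_converter_random()
converter.convert_all_attributes()

# get_params() (shown in the diff) returns fully numeric NumPy arrays.
headers, class_label, features, phenotypes = converter.get_params()
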
