
    TfB<                         d Z ddlZddlmc mZ ddlZddlZddl	m
Z
 ej                            d            G d de          Z G d de          Zd	 ZddZdS )z5
Created on Sat May 18 19:24:45 2024

@author: atdou
    N)pyplot   c                   *    e Zd ZdZd Zd Zd Zd ZdS )create_regress_dataz
    Description
    -----------
    Creates an example dataset of arbitrary dimensions, roughly following a regression function, f, 
    and also populated with outliers    
    c                 6    t          j                    | _        dS )aw  
        Parameters
        ----------
        dim: int
            the dimension of the X data
        
        Attributes
        ----------
        dim: int
        data: pandas Datafarme
            this is the dataframe we're trying to create.  It will ultimately be of the form:
            ["x_0", "x_1", ..., "x_n", "y", "f", "Residuals", "Outlying?"]       
        Npandas	DataFramedataselfs    bC:\Users\atdou\OneDrive\Desktop\Files\Coding\Python\Programs\XtraMLTools\test_auxiliary_classes.py__init__zcreate_regress_data.__init__        $&&			    c                    || _         || _        || _        || _        || _        t
          j                            | j         |z
  | j        f          | j        | j        z
  z  | j        z   | _        t
          j                            || j        f          | j        | j        z
  z  | j        z   | _        t          j
        t          j        | j        | j        f          d t          | j                  D                       | _        dS )b   
        Description
        -----------
        generating N random points along x axis, n_out of which are (possible) outliers.  
        
        Parameters
        ----------
        x_min: float
        x_max: float
        N: int
            number of total points, including outliers
        n_out: int 
            the number of outliers
        dim: int
            the dimension of X/number of columns of X
            
        Attributes
        ----------
        N: int
        n_out: int
        dim: int
        x_min: float
        x_max: float
        X_norm: numpy array
            of X coordinates of going to be ostensibly normal y points
        X_out: numpy array
            of X coordinates of going to be ostensibly outlier y points
        X: pandas DataFrame
            X_norm and X_out concatenated and turned into a dataframe
        c                 2    g | ]}d t          |          z   S x_str.0ns     r   
<listcomp>z-create_regress_data.set_X.<locals>.<listcomp>K   s"    ,R,R,RQT#a&&[,R,R,Rr   columnsN)Nn_outdimx_minx_maxnumpyrandomX_normX_outr	   r
   concatenaterangeXr   r"   r#   r   r    r!   s         r   set_Xzcreate_regress_data.set_X$   s    > 


l))46%<*ABBDJtzDYZ[_[ee\((%)9::DJtz<QRSWS]]
!%"3T[$*4M"N"N,R,R%//,R,R,RT T Tr   c                     | _         t          j         fd j        D                       }t          j         fd j        D                       }||t          j                             j         j        z
            z  z   }|t          j        	                    || j                  z   }t          j        t          j        ||f                     _        t          j        t          j        ||f                     _        dS )af   
        Description
        -----------
        generating N random points along y axis, roughly tracking f, n_out of which are outliers
        The random points follow a normal distribution about the f with standard deviation specified.  
        The possible outliers are set to follow a uniform distribution with a user specified min/max range 
        added to self.  Since these points are random, some of the normal points may actually end up as outliers,
        and not all of the outlier points will necessarily end up as outliers.  
        
        Parameters
        ----------
        f: function
            the function mapping the rows of X.  Should be something like f(x) = 3*x[0]*x[1] -9*x[2]**2 + 7.
            In this example, x[0], x[1], x[2] reference the first, second, and third columns.  
        dev: float 
            strengh of random fluctuations about f of the normal random points
        min_y: float
            minimum value of uniform distribution for ostensible outlier points
        max_y: float
            maximum value of uniform distribution for ostensible outlier points
            
        Attributes
        ----------
        func: function
            this is f
        f: pandas DataFrame
            this is f evaluated at all X points
        y: pandas DataFrame
            y-coordinates of normal and outlier points
        c                 :    g | ]}                     |          S  funcr   xr   s     r   r   z-create_regress_data.set_y.<locals>.<listcomp>m   s#    @@@qdiill@@@r   c                 :    g | ]}                     |          S r/   r0   r2   s     r   r   z-create_regress_data.set_y.<locals>.<listcomp>n   s#    >>>aTYYq\\>>>r   )sizeN)r1   r$   arrayr&   r'   r%   normalr   r    uniformr	   Seriesr(   fy)	r   r1   devmin_ymax_yf_normf_outy_normy_outs	   `        r   set_yzcreate_regress_data.set_yM   s    > 	@@@@DK@@@AA>>>>4:>>>??3u|22tz8I2JJJJ,,UE
,KKKu0&%AABBu0&%AABBr   c                    t          j        | j        dg          }t          j        | j        dg          }t          j        | j        ||gd          | _        | j        d         | j        d         z
  | j        d<   || _        || _        t          j
        | j        d         g d          }|d         |d         z
  }| j        d                                         }| j        d                                         }| j        d	k    r'|d         | j        |z  z
  |d         | j        |z  z   }
}	n%| j        d
k    r|| j        |z  z
  || j        |z  z   }
}	g }t          t          | j                            D ]]}| j        j        |df         |	k     s| j        j        |df         |
k    r|                    d           H|                    d           ^|| j        d<   dS )a@  
        Description
        -----------
        Now we concatenate the X, and y, and f points.  And we classify points as outlying or not.  
        
        Parameters
        ----------
        metric: string
            this is "IQR", or "std"
        factor: float
            this is the parameter that multiplies either IQR = Q[3] - Q[1], or std, when classifying outlies, i.e.,
            (Q[1] - factor*IQR, Q[3] + factor*IQR), or (mean - factor*std, mean + factor*std)
    
        Attributes
        ----------
        metric: string
        factor: float
        r:   r   r;      )axis	Residuals)r   g      ?g      ?g      ?rE      IQRstdTF	Outlying?N)r	   r
   r:   r;   concatr*   r   metricfactorr$   quantilemeanrJ   r)   lenlocappend)r   rM   rN   r:   r;   QrI   rP   rJ   Res_low_boundRes_high_boundrow_outlier_setis                r   classify_outliersz%create_regress_data.classify_outlierst   s   & TVu555TVu555M461a.q999	!%3$)C.!@	+N49[12F2F2FGGd1Q4iy%**,,i$((**;%,-aD4;s?,BAaD4;WZ?DZ>MM[E!!,0S,@$t{SVBV>Ms49~~&& 	. 	.A	am,}<<$)-PQR]P]B^aoBoBo&&t,,,,&&u----!0	+r   N)__name__
__module____qualname____doc__r   r,   rC   rY   r/   r   r   r   r      sb         ' ' ' 'T 'T 'TR%C %C %CN'1 '1 '1 '1 '1r   r   c                   $    e Zd ZdZd Zd Zd ZdS )create_classy_dataz
    Description
    -----------
    Creates an example dataset of arbitrary dimensions, roughly following a classification function, f, 
    and also populated with outliers    
    c                 6    t          j                    | _        dS )a\  
        Parameters
        ----------
        dim: int
            the dimension of the X data
        
        Attributes
        ----------
        dim: int
        data: pandas Datafarme
            this is the dataframe we're trying to create.  It will ultimately be of the form:
            ["x_0", "x_1", ..., "x_n", "Class"]       
        Nr   r   s    r   r   zcreate_classy_data.__init__   r   r   c                 *   || _         || _        || _        || _        || _        t
          j                            | j         |z
  | j        f          | j        | j        z
  z  | j        z   | _        t
          j                            || j        f          | j        | j        z
  z  | j        z   | _        t          j	        | j        | j        f          | _
        t          j        | j
        d t          | j                  D                       | _        dS )r   c                 2    g | ]}d t          |          z   S r   r   r   s     r   r   z,create_classy_data.set_X.<locals>.<listcomp>   s"    :`:`:`14A;:`:`:`r   r   N)r   r    r!   r"   r#   r$   r%   r&   r'   r(   X_arrayr	   r
   r)   r*   r+   s         r   r,   zcreate_classy_data.set_X   s    > 


l))46%<*ABBDJtzDYZ[_[ee\((%)9::DJtz<QRSWS]]
($+tz)BCC!$,:`:`PUVZV^P_P_:`:`:`aaar   c                     | _         | _        t           j                  t          j        d t           j         j        z
            D                       }t          j        d t           j                  D                       }t          j        ||f          }t          j         fd j	        D                       }fd}	t          j
        |	dd          }
t          j         |
||                                        d           _         j                                         _        | j        d<   | j        d	<    j         j        d
<   dS )a`   
        Description
        -----------
        generating N random points along y axis, roughly tracking f, n_out of which are outliers.  
        The random points follow a multinomial distribution about f.  Their classification according to f gets a 
        probability specified p_norm = (0,1), wuile the other possibilities equally divide up the remaining probability 
        q_norm = 1-p_norm.  The outlier points follow a similar multinomial distribution function.  p_out = (0,1) is 
        the probability they take on the value that f tells them to, and the remaining possibilities equally split 
        q_out = 1 - p_out.  
        
        Parameters
        ----------
        f: function
            the function mapping the rows of X to a given class value.  Should be something like 
            f(x) = 1 if x[0] < 0, 2 if x[0] > 0, etc.  Can obviously be more complicated than this.  
        f_values: list
            of the possible values f can output.  These would be the numbers associated with the different classes
        p_norm: float 
            probability the normal points assume value f says they should.
        p_out: float
            probability the outlier points assume value f says they should.
            
        Attributes
        ----------
        func: function
            this is f
        f_values: list
            this is the list of possible values f can output.  This should be integers starting from 0.  
        f: pandas DataFrame
            this is f evaluated at all X points
        y: pandas DataFrame
            y-coordinates of normal and outlier points
        c                     g | ]}d S )Fr/   r   js     r   r   z,create_classy_data.set_y.<locals>.<listcomp>  s    *U*U*UQ5*U*U*Ur   c                     g | ]}d S )Tr/   rf   s     r   r   z,create_classy_data.set_y.<locals>.<listcomp>  s    )J)J)J1$)J)J)Jr   c                 :    g | ]}                     |          S r/   r0   r2   s     r   r   z,create_classy_data.set_y.<locals>.<listcomp>  s#    BBBtyy||BBBr   c                     | dk    rnfdt                    D             }||<   t          j        t          j                            d|                    }|S )NFc                 &    g | ]}d z
  d z
  z  S )rE   r/   )r   rg   
num_valuesps     r   r   zDcreate_classy_data.set_y.<locals>.class_assigner.<locals>.<listcomp>	  s&    EEEaacJqL)EEEr   rE   )r)   r$   argmaxr%   multinomial)sr:   probscrm   rl   p_normp_outs       @r   class_assignerz0create_classy_data.set_y.<locals>.class_assigner  sh    U((AEEEEE53D3DEEEEE!HU\55a??@@AHr      rE   intrK   r:   r;   N)r1   f_valuesrQ   r$   r6   r)   r   r    r(   rc   
frompyfuncr	   r9   astyper;   r*   copyr   )r   r1   rx   rs   rt   outlier_status_normoutlier_status_outoutlier_status_arrayf_arrayru   num_class_assignerrl   s   `  ``      @r   rC   zcreate_classy_data.set_y   s_   D 	 ''
#k*U*U%@S:T:T*U*U*UVV"[)J)Jdj8I8I)J)J)JKK$02EGY1Z[[+BBBBT\BBBCC	 	 	 	 	 	 	 #-naCC112FPPQQXXY^__FKKMM	!5	+ 	#	#r   N)rZ   r[   r\   r]   r   r,   rC   r/   r   r   r_   r_      sO         ' ' ' 'b 'b 'bR4  4  4  4  4 r   r_   c                    i }|D ]}||         }| d         dk    |d         dk    z                                   }| d         dk    |d         dk    z                                   }| d         dk    |d         dk    z                                   }| d         dk    |d         dk    z                                   }t          j        ||g||gg          ||<   |S )a  
    Description
    -----------
    outputs a dictionary of classification matrices comparing ROR outlier predictions on data_exp vs their actual status.  
    Should create a data object and then fit or transform a ROR on it.  Then feed the data object into data_exp argument above,
    and then feed in the dictionary of dataframes desired from ROR.  
        
    Parameters
    ----------
    data_exp: pandas dataframe
        this is a dataframe object as created above in that data class, i.e., data.data
    data_dict: dictionary of pandas dataframes
        should fit and or transform ROR to data.data.  Then can create a dictionary of fit or transform 
        df's with columns ["y_pred", "Residuals", "Outlying_Predictions"].  These are found in ROR.train_data, 
        ROR.train_data_ave, and ROR.test_data, ROR.test_data_ave.  So can test any of these guys.  
        
    Returns
    -------
    dictionary:
        of classification matrices
    rK   TOutlying_PredictionF)sumr$   r6   )	data_exp	data_dictCM_dictnamer   TPFPFNTNs	            r   CM_ROR_predictionsr     s    , G 7 7$d*t4I/JD/PQVVXX$e+5J0KT0QRWWYY$d*t4I/JE/QRWWYY$e+5J0KU0RSXXZZbWbW$566Nr   x_0c           	      |   d}d}| |                                          | |                                         z
  }| d                                          | d                                         z
  }| |                                         d|z  z
  }| |                                          d|z  z   }| d                                         d|z  z
  }	| d                                          d|z  z   }
| d                             ||d          | d<   t          j                     t          j                    }|                     |          }|                    ||         |d	         d
           |                    | |         | d         | d         d           |	                    t          t          j        ||d                               |                    t          t          j        |	|
d                               |                    ||           |                    d           |                    |	|
           |                    d           |                    d           t          j                     |dk    rg d}d}|D ]}||         }||dz           }|j        }|                    | |         |         |d         |         d||           |                    | |         |d                  | d         |d                  |ddd           |                                 |dz  }dS dS )a  
    Description
    -----------
    in data_exp.data, it plots user specified col_X vs. y, coloring the normal points grey, and the outlier points red.  
    and it does the same for each dataframe in the data_dict.  Further, it circles the outlier predictions of the models''
    data in the data_dict, so can compare actual outliers (red) with predicted (green circles).  x_min/max, y_min/max
    set the edges of the graph. 

    Parameters
    ----------
    data_exp : pandas dataframe, specifically of format create_data.data
        this is the experimental data that has the true outliers classed.
    data_dict : dictionary of pandas dataframes
        the dataframes could be from a ROR object, in ROR.train_data[m][n], and ROR.test_data[1], and ROR.train_data_ave, and 
        ROR.test_data_ave, etc.  
    col_X : string
        the column in X that we want to plot vs. y.  As X can have multiple columns.
    greyredr;   g?rK   )TFColor)byr:   black)color   )r   rp      r3   z!Feature Space and Actual OutliersN)darkblue	orangered	limegreenpurpler   y_predrv   )	linewidthr   labelr   noneod   )
edgecolors
facecolorsmarkerrp   rE   )maxminmapr   figureaxessort_valuesplotscatter
set_xtickslistr$   linspace
set_yticksset_xlim
set_xlabelset_ylim
set_ylabel	set_titlegridindexlegend)r   r   col_X
color_norm	color_outx_rangey_ranger"   r#   y_miny_maxaxdata_exp_sortedcolorsrg   r   r   hueindicess                      r   plot_ROR_predictionsr   6  s   & JIuo!!##huo&9&9&;&;;Gsm!!HSM$5$5$7$77GUO!!CK/EUO!!CK/ESM#g+-ESM#g+-E -11:2V2VWWHW
MOOO	B**e*44OGGOE"OC$8'GJJJJJxx7HAJNNNMM$u~eUA6677888MM$u~eUA6677888KKuMM#KKuMM#LL4555
KMMMDAAA 	 	DT?D1+C%+GGGHUOG,d8nW.EQR\_imGnnnJJxt,A'BCXc]SWXmSnEo#&Vcs  T T TIIKKKqDAA 	 	r   )r   )r]   builtins@py_builtins_pytest.assertion.rewrite	assertionrewrite
@pytest_arr	   r$   
matplotlibr   r%   seedobjectr   r_   r   r   r/   r   r   <module>r      s                                  "   N1 N1 N1 N1 N1& N1 N1 N1bt  t  t  t  t  t  t  t n  B4 4 4 4 4 4r   