معرفی شرکت ها

تبلیغات ما

مشتریان به طور فزاینده ای آنلاین هستند. تبلیغات می تواند به آنها کمک کند تا کسب و کار شما را پیدا کنند.

مشاهده بیشتر

تبلیغات ما

مشتریان به طور فزاینده ای آنلاین هستند. تبلیغات می تواند به آنها کمک کند تا کسب و کار شما را پیدا کنند.

مشاهده بیشتر

تبلیغات ما

مشتریان به طور فزاینده ای آنلاین هستند. تبلیغات می تواند به آنها کمک کند تا کسب و کار شما را پیدا کنند.

مشاهده بیشتر

تبلیغات ما

مشتریان به طور فزاینده ای آنلاین هستند. تبلیغات می تواند به آنها کمک کند تا کسب و کار شما را پیدا کنند.

مشاهده بیشتر

تبلیغات ما

مشتریان به طور فزاینده ای آنلاین هستند. تبلیغات می تواند به آنها کمک کند تا کسب و کار شما را پیدا کنند.

مشاهده بیشتر

توضیحات

A collection of our functions and classes from bootcamp.

ویژگی	مقدار
سیستم عامل	OS Independent
نام فایل	JMI-MVM-0.3.4
نام	JMI-MVM
نسخه کتابخانه	0.3.4
نگهدارنده	[]
ایمیل نگهدارنده	[]
نویسنده	James M. Irving, Michael V. Moravetz
ایمیل نویسنده	james.irving.phd@outlook.com
آدرس صفحه اصلی	https://github.com/jirvingphd/JMI_MVM
آدرس اینترنتی	https://pypi.org/project/JMI-MVM/
مجوز	-

# JMI_MVM - A collection of tools created for botcmap. - More information to be added later. <h1>Table of Contents<span class="tocSkip"></span></h1> <div class="toc"><ul class="toc-item"></ul></div> ```python name = "JMI_MVM" help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n" #functions.py import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib as mpl import seaborn as sns def calc_roc_auc(X_test,y_test,dtc,verbose=False): """Tests the results of an already-fit classifer. Takes X_test, y_test, classifer, verbose (True" print result) Returns the AUC for the roc_curve as a %""" y_pred = dtc.predict(X_test) FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred) roc_auc = auc(FP_rate,TP_rate) roc_auc_perc = round(roc_auc*100,3) # Your code here if verbose: print(f"roc_curve's auc = {roc_auc_perc}%") return roc_auc_perc def tune_params(param_name, param_values): """Takes in param_name to tune with param_values, plots train vs test AUC's. Returns df_results and df_style with color coded results""" res_list = [[param_name,'train_roc_auc','test_roc_auc']] # Loop through all values in param_values for value in param_values: # Create Model, set params dtc_temp = DecisionTreeClassifier(criterion='entropy') params={param_name:value} dtc_temp.set_params(**params) # Fit model dtc_temp.fit(X_train, y_train) # Get roc_auc for training data train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp) # Get roc_auc for test data test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp) # Append value and results to res_list res_list.append([value,train_roc_auc,test_res_roc_auc]) # Turn results into df_results (basically same as using list2df) df_results = pd.DataFrame(res_list[1:],columns=res_list[0]) df_results.set_index(param_name,inplace=True) # Plot df_results df_results.plot() # Color-coded dataframe s import seaborn as sns cm = sns.light_palette("green", as_cmap=True) df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max()) return df_results, df_syle # MULTIPLOT from string import ascii_letters import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt def multiplot(df): """Plots results from df.corr() in a correlation heat map for multicollinearity. Returns fig, ax objects""" sns.set(style="white") # Compute the correlation matrix corr = df.corr() # Generate a mask for the upper triangle mask = np.zeros_like(corr, dtype=np.bool) mask[np.triu_indices_from(mask)] = True # Set up the matplotlib figure f, ax = plt.subplots(figsize=(16, 16)) # Generate a custom diverging colormap cmap = sns.diverging_palette(220, 10, as_cmap=True) # Draw the heatmap with the mask and correct aspect ratio sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}) return f, ax # Plots histogram and scatter (vs price) side by side # Plots histogram and scatter (vs price) side by side def plot_hist_scat_sns(df, target='index'): """Plots seaborne distplots and regplots for columns im datamframe vs target. Parameters: df (DataFrame): DataFrame.describe() columns will be used. target = name of column containing target variable.assume first coluumn. Returns: Figures for each column vs target with 2 subplots. """ import matplotlib.ticker as mtick import matplotlib.pyplot as plt import seaborn as sns with plt.style.context(('dark_background')): ### DEFINE AESTHETIC CUSTOMIZATIONS -------------------------------## # plt.style.use('dark_background') figsize=(9,7) # Axis Label fonts fontTitle = {'fontsize': 14, 'fontweight': 'bold', 'fontfamily':'serif'} fontAxis = {'fontsize': 12, 'fontweight': 'medium', 'fontfamily':'serif'} fontTicks = {'fontsize': 8, 'fontweight':'medium', 'fontfamily':'serif'} # Formatting dollar sign labels fmtPrice = '${x:,.0f}' tickPrice = mtick.StrMethodFormatter(fmtPrice) ### PLOTTING ----------------------------- ------------------------ ## # Loop through dataframe to plot for column in df.describe(): # print(f'\nCurrent column: {column}') # Create figure with subplots for current column fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2) ## SUBPLOT 1 --------------------------------------------------## i,j = 0,0 ax[i,j].set_title(column.capitalize(),fontdict=fontTitle) # Define graphing keyword dictionaries for distplot (Subplot 1) hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'} kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"} # Plot distplot on ax[i,j] using hist_kws and kde_kws sns.distplot(df[column], norm_hist=True, kde=True, hist_kws = hist_kws, kde_kws = kde_kws, label=column+' histogram', ax=ax[i,j]) # Set x axis label ax[i,j].set_xlabel(column.title(),fontdict=fontAxis) # Get x-ticks, rotate labels, and return xticklab1 = ax[i,j].get_xticklabels(which = 'both') ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0) ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter()) # Set y-label ax[i,j].set_ylabel('Density',fontdict=fontAxis) yticklab1=ax[i,j].get_yticklabels(which='both') ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks) ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter()) # Set y-grid ax[i, j].set_axisbelow(True) ax[i, j].grid(axis='y',ls='--') ## SUBPLOT 2-------------------------------------------------- ## i,j = 0,1 ax[i,j].set_title(column.capitalize(),fontdict=fontTitle) # Define the kwd dictionaries for scatter and regression line (subplot 2) line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"} scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'} # Plot regplot on ax[i,j] using line_kws and scatter_kws sns.regplot(df[column], df[target], line_kws = line_kws, scatter_kws = scatter_kws, ax=ax[i,j]) # Set x-axis label ax[i,j].set_xlabel(column.title(),fontdict=fontAxis) # Get x ticks, rotate labels, and return xticklab2=ax[i,j].get_xticklabels(which='both') ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0) ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter()) # Set y-axis label ax[i,j].set_ylabel(target,fontdict=fontAxis) # Get, set, and format y-axis Price labels yticklab = ax[i,j].get_yticklabels() ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks) ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter()) # ax[i,j].get_yaxis().set_major_formatter(tickPrice) # Set y-grid ax[i, j].set_axisbelow(True) ax[i, j].grid(axis='y',ls='--') ## ---------- Final layout adjustments ----------- ## # Deleted unused subplots fig.delaxes(ax[1,1]) fig.delaxes(ax[1,0]) # Optimizing spatial layout fig.tight_layout() figtitle=column+'_dist_regr_plots.png' # plt.savefig(figtitle) return # Tukey's method using IQR to eliminate def detect_outliers(df, n, features): """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe. Parameters: df (DataFrame): DataFrane containing columns of features n: default is 0, multiple outlier cutoff Returns: Index of outliers for .loc Examples: Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value df.loc[Outliers_to_drop] # Show the outliers rows data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True) """ # Drop outliers outlier_indices = [] # iterate over features(columns) for col in features: # 1st quartile (25%) Q1 = np.percentile(df[col], 25) # 3rd quartile (75%) Q3 = np.percentile(df[col],75) # Interquartile range (IQR) IQR = Q3 - Q1 # outlier step outlier_step = 1.5 * IQR # Determine a list of indices of outliers for feature col outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index # append the found outlier indices for col to the list of outlier indices outlier_indices.extend(outlier_list_col) # select observations containing more than 2 outliers outlier_indices = Counter(outlier_indices) multiple_outliers = list( k for k, v in outlier_indices.items() if v > n ) return multiple_outliers # describe_outliers -- calls detect_outliers def describe_outliers(df): """ Returns a new_df of outliers, and % outliers each col using detect_outliers. """ out_count = 0 new_df = pd.DataFrame(columns=['total_outliers', 'percent_total']) for col in df.columns: outies = detect_outliers(df[col]) out_count += len(outies) new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)] new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])] return new_df #### Cohen's d def Cohen_d(group1, group2): '''Compute Cohen's d. # group1: Series or NumPy array # group2: Series or NumPy array # returns a floating point number ''' diff = group1.mean() - group2.mean() n1, n2 = len(group1), len(group2) var1 = group1.var() var2 = group2.var() # Calculate the pooled threshold as shown earlier pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2) # Calculate Cohen's d statistic d = diff / np.sqrt(pooled_var) return d def plot_pdfs(cohen_d=2): """Plot PDFs for distributions that differ by some number of stds. cohen_d: number of standard deviations between the means """ group1 = scipy.stats.norm(0, 1) group2 = scipy.stats.norm(cohen_d, 1) xs, ys = evaluate_PDF(group1) pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7) xs, ys = evaluate_PDF(group2) pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7) o, s = overlap_superiority(group1, group2) print('overlap', o) print('superiority', s) def list2df(list):#, sort_values='index'): """ Take in a list where row[0] = column_names and outputs a dataframe. Keyword arguments: set_index -- df.set_index(set_index) sortby -- df.sorted() """ df_list = pd.DataFrame(list[1:],columns=list[0]) # df_list = df_list[1:] return df_list def df_drop_regex(DF, regex_list): '''Use a list of regex to remove columns names. Returns new df. Parameters: DF -- input dataframe to remove columns from. regex_list -- list of string patterns or regexp to remove. Returns: df_cut -- input df without the dropped columns. ''' df_cut = DF.copy() for r in regex_list: df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))] print(f'Removed {r}\n') return df_cut ####### MIKE's PLOTTING # plotting order totals per month in violin plots def make_violinplot(x,y, title=None, hue=None, ticklabels=None): '''Plots a violin plot with horizontal mean line, inner stick lines''' plt.style.use('dark_background') fig,ax =plt.subplots(figsize=(12,10)) sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True, saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title) ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow') ax.set_xticklabels(ticklabels) plt.legend() plt.show() x= df_year_orders['month'] y= df_year_orders['order_total'] title = 'Order totals per month with or without discounts' hue=df_year_orders['Discount']>0 ### Example usage # #First, declare variables to be plotted # x = df_year_orders['month'] # y = df_year_orders['order_total'] # ticks = [v for v in month_dict.values()] # title = 'Order totals per month with or without discounts' # hue = df_year_orders['Discount']>0 ### Then call function # make_violinplot(x,y,title,hue, ticks), ########### def plot_wide_kde_thin_bar(series1,sname1, series2, sname2): '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.''' ## ADDING add_gridspec usage import pandas as pd import numpy as np from scipy.stats import sem import matplotlib.pyplot as plt import matplotlib as mpl import matplotlib.ticker as ticker import seaborn as sns from matplotlib import rcParams from matplotlib import rc rcParams['font.family'] = 'serif' # Plot distributions of discounted vs full price groups plt.style.use('default') # with plt.style.context(('tableau-colorblind10')): with plt.style.context(('seaborn-notebook')): ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ## # Axis Label fonts fontSuptitle ={'fontsize': 22, 'fontweight': 'bold', 'fontfamily':'serif'} fontTitle = {'fontsize': 10, 'fontweight': 'medium', 'fontfamily':'serif'} fontAxis = {'fontsize': 10, 'fontweight': 'medium', 'fontfamily':'serif'} fontTicks = {'fontsize': 8, 'fontweight':'medium', 'fontfamily':'serif'} ## --------- CREATE FIG BASED ON GRIDSPEC --------- ## plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle) # Create fig object and declare figsize fig = plt.figure(constrained_layout=True, figsize=(8,3)) # Define gridspec to create grid coordinates gs = fig.add_gridspec(nrows=1,ncols=10) # Assign grid space to ax with add_subplot ax0 = fig.add_subplot(gs[0,0:7]) ax1 = fig.add_subplot(gs[0,7:10]) #Combine into 1 list ax = [ax0,ax1] ### ------------------ SUBPLOT 1 ------------------ ### ## --------- Defining series1 and 2 for subplot 1------- ## ax[0].set_title('Histogram + KDE',fontdict=fontTitle) # Group 1: data, label, hist_kws and kde_kws plotS1 = {'data': series1, 'label': sname1.title(), 'hist_kws' : {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5}, 'kde_kws': {'color':'gray', 'linestyle': '--', 'linewidth':2, 'label':'kde'}} # Group 2: data, label, hist_kws and kde_kws plotS2 = {'data': series2, 'label': sname2.title(), 'hist_kws' : {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5}, 'kde_kws': {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}} # plot group 1 sns.distplot(plotS1['data'], label=plotS1['label'], hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'], ax=ax[0]) # plot group 2 sns.distplot(plotS2['data'], label=plotS2['label'], hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'], ax=ax[0]) ax[0].set_xlabel(series1.name, fontdict=fontAxis) ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis) ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize']) ax[0].legend() ### ------------------ SUBPLOT 2 ------------------ ### # Import scipy for error bars from scipy.stats import sem # Declare x y group labels(x) and bar heights(y) x = [plotS1['label'], plotS2['label']] y = [np.mean(plotS1['data']), np.mean(plotS2['data'])] yerr = [sem(plotS1['data']), sem(plotS2['data'])] err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1} # Create the bar plot ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6) # Customize subplot 2 ax[1].set_title('Average Quantities Sold',fontdict=fontTitle) ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis) ax[1].set_xlabel('') ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize']) ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize']) ax1=ax[1] test = ax1.get_xticklabels() labels = [x.get_text() for x in test] ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center') # xlab = [x.get_text() for x in xlablist] # ax[1].set_xticklabels(xlab,rotation=45) # fig.savefig('H1_EDA_using_gridspec.png') # plt.tight_layout() # print(f') plt.show() return fig,ax ``` ```python ```

نیازمندی

مقدار	نام
-	numpy
-	pandas
-	seaborn
-	matplotlib
-	sklearn
-	pydotplus
-	scipy
-	xgboost

نحوه نصب

نصب پکیج whl JMI-MVM-0.3.4:

pip install JMI-MVM-0.3.4.whl

نصب پکیج tar.gz JMI-MVM-0.3.4:

pip install JMI-MVM-0.3.4.tar.gz