Bill Qualls: My Python Cheat Sheet




Install Spyder without Anaconda

(This is my preferred Python IDE, and I don't care for Anaconda for running production programs.)
  • First, install Python 3.7
  • Open command window (Run as Administrator)
  • cd "c:\Program Files\Python37\Scripts"    ← because pip.exe resides there
  • or cd "C:\Users\bill_\AppData\Local\Programs\Python\Python37\Scripts"    ← it might be there instead
  • pip install spyder    ← creates spyder3.exe in c:\Program Files\Python37\Scripts
  • This is also the easiest way to install any other missing packages: pip install pkgname
  • Navigate to spyder3.exe and pin to task bar
  • Click icon on task bar to run



Start program

#==========================================================================
# Identification Division  LOL 
#==========================================================================
PROGRAM_ID   = "0101_preedits.py"
AUTHOR       = "Bill Qualls"
DATE_WRITTEN = "November 2019"
PROJECT_NAME = "NS Claims"

print("PROGRAM_ID   = " + PROGRAM_ID)
print("AUTHOR       = " + AUTHOR)
print("DATE_WRITTEN = " + DATE_WRITTEN)
print("PROJECT_NAME = " + PROJECT_NAME)
print()


#==========================================================================
# Initialize return code
#==========================================================================
RC = 0


#==========================================================================
# Start timer
#==========================================================================
import timeit                             # required for default_timer()
programStartTime = timeit.default_timer()


#==========================================================================
# Show command line arguments
#==========================================================================
import sys
i = 0
print("There are " + str(len(sys.argv)) + " command line arguments:")
for arg in sys.argv:
    print ("sys.argv[" + str(i) + "] = " + sys.argv[i])
    i = i + 1
print()
    

#==========================================================================
# Runtime options
#==========================================================================
DO_PICKLING  = True
DO_PROFILING = False



Imports

#==========================================================================
# Imports...
#==========================================================================
import pandas as pd
import pandas_profiling as pp
import numpy  as np
import datetime as dt
import os                                 # required for os.walk()
import sys                                # required for sys.path


try:
    import nscc                           # my shared code (constants and functions)                       
except ImportError:
    # This occurs when you use "Run selection" instead of pressing F5 to run all.
    if (os.getcwd() == "C:\\Users\\bill_"):    # Spyder default
        sys.path.append("D:\\AllFiles\\FA\\ns_claims\\source")
    import nscc


RUNTIME = str(dt.datetime.now())[0:19]                    # slice
print("RUNTIME      = " + RUNTIME)                        # 1984-07-23 12:34:56
RUNDATE  = RUNTIME[0:4] + RUNTIME[5:7] + RUNTIME[8:10]    # zero-based slicing: starts_at:ends_at_plus_one
print("RUNDATE      = " + RUNDATE)                        # 19840723




User name and host name

#==========================================================================
# Get username and hostname
#==========================================================================

import getpass
import socket

USERNAME = getpass.getuser()
HOSTNAME = socket.gethostname()



Some constants

#==========================================================================
# Some constants...
#==========================================================================
SEP = "\\"
DASHES = 60*"-"
EQUALS = 60*"="



File paths

#==========================================================================
# File paths...
#==========================================================================

PROJECT_FOLDER = r"F:\myclient"
RAWDATA_FOLDER               = PROJECT_FOLDER + SEP + "rawdata"
INBOUND_FOLDER               = PROJECT_FOLDER + SEP + "inbound"
OUTBOUND_FOLDER              = PROJECT_FOLDER + SEP + "outbound"
LOGS_FOLDER                  = PROJECT_FOLDER + SEP + "logs"
PICKLE_FOLDER                = PROJECT_FOLDER + SEP + "pickle"
PROFILING_FOLDER             = PROJECT_FOLDER + SEP + "profiling"

print("SEP                          = " + SEP)
print("PROJECT_FOLDER               = " + PROJECT_FOLDER)
print("RAWDATA_FOLDER               = " + RAWDATA_FOLDER)
print("INBOUND_FOLDER               = " + INBOUND_FOLDER)
print("OUTBOUND_FOLDER              = " + OUTBOUND_FOLDER)
print("LOGS_FOLDER                  = " + LOGS_FOLDER)
print("PICKLE_FOLDER                = " + PICKLE_FOLDER)
print("PROFILING_FOLDER             = " + PROFILING_FOLDER)



A quick test of Pandas profiling

#==========================================================================
# A quick test of pandas profiling...
#==========================================================================
if True:
    
    df = pd.DataFrame(data={'x': [1, 2, 3, 4, 5], 'y': [2, 2, 4, 6, 6], 'z': [4, 6, 1, 5, 2]})
    print(df.head(10))
    profile = pp.ProfileReport(df)
    profile.to_file(output_file=PROFILING_FOLDER + SEP + "pandas_profiling_demo.html")
    print("Done.")
    
else:
    
    print("")
    print("Skipped.")



My Pandas Profiling function

#==========================================================================
# My Pandas Profiling function...
#==========================================================================
def profileit(df, PROFILING_FILE):
    
    print("Begin profiling...")
    
    print("Size: " + str(df.shape[0]) + " rows x " + str(df.shape[1]) + " columns.")
    #print(df.head())
    #print(df.tail())

    # add a column called _DUMMY_ which has a value of 1 to prevent ZeroDivisionError.
    # this way I can force a profile all the time, even if no numeric fields.
    for idx, row in df.iterrows():
        df.at[idx, '_DUMMY_'] = 1

    try:
        profile = pp.ProfileReport(df)
        profile.to_file(output_file=PROFILING_FILE)
        print("Profile : " + PROFILING_FILE)
    except ZeroDivisionError:
        print("ZeroDivisionError occurred. This usually means a data file with no numeric fields")
        print("Profiling page not produced.")


# test function
profileit(df, PROFILING_FOLDER + SEP + "pandas_profiling_demo_from_function.html")



How I handle command line arguments in development

#==========================================================================
# Show command line arguments
# argv[0] is path to Optilytics folder. Example: E:\FK\Optilytics
# argv[1] is hospital name.             Example: Aspen
# argv[2] is optimization objective.    Example: 3
# argv[3] is recipient email address.   Example: bqualls@firstanalytics.com
#==========================================================================

i = 0
print()
myargv = sys.argv.copy()    # so I can provide defaults while in development

print("There are " + str(len(myargv)) + " command line arguments:")

# defaults
if (len(myargv) == 1):    # because the zeroth arg will always be there
    print(" Using default arguments.")
    myargv   = [
                    "D:\Optilytics"
                    , "Aspen"
                    , "3"
                    , "bqualls@firstanalytics.com"
               ]    

for arg in myargv:
    print("sys.argv[" + str(i) + "] = " + myargv[i])
    i = i + 1
print()

if (len(myargv) != 4):
    print(" There should be four command line arguments.")
    RC = 2
    terminateProgramWithID(RC, PROGRAM_ID)
    
OPTILYTICS_FOLDER   = myargv[0]
HOSPITAL_NAME       = myargv[1]
OBJECTIVE_NUMBER    = myargv[2]
EMAIL_RECIPIENT     = myargv[3]

print("OPTILYTICS_FOLDER = " + OPTILYTICS_FOLDER)
print("HOSPITAL_NAME     = " + HOSPITAL_NAME)
print("OBJECTIVE_NUMBER  = " + OBJECTIVE_NUMBER)
print("EMAIL_RECIPIENT   = " + EMAIL_RECIPIENT)
print()



Date conversions

# String "m([m]/d[d]/yyyy" becomes string "yyyy-mm-dd"
# Example: "6/1/1990" becomes "1990-06-01"
df["hiredate"] = df["Hired Date" ].apply(lambda x: pd.to_datetime(x).isoformat()[0:10])



Import from Excel

#==========================================================================
# Import client data from Excel...
#==========================================================================
if False:

    IMPORT_FILE = RAWDATA_FOLDER + SEP + "clientfile.xlsx"
    DTYPES = { 'Emp Nbr': str, 'Zip Code': str }
    DATE_COLS = [ 'Date' ]


    print("")
    print(DASHES)
    print("")

    print("Begin import " + IMPORT_FILE)

    startTime = timeit.default_timer()
    IMPORT_SHEET = "Sheet1"
    PICKLE_FILE = PICKLE_FOLDER + SEP + "myfile_imp.pickle"
    PROFILING_FILE = PROFILING_FOLDER + SEP + "myfile_imp.html"
    df = pd.read_excel(IMPORT_FILE, sheet_name=IMPORT_SHEET, dtype=DTYPES, parse_dates=DATE_COLS)
    print("Size: " + str(df.shape[0]) + " rows x " + str(df.shape[1]) + " columns.")
    stopTime = timeit.default_timer()
    elapsedTime = stopTime - startTime
    print("Elapsed time to import df was " + str(round(elapsedTime)) + " seconds.")

    startTime = timeit.default_timer()
    pd.to_pickle(df, PICKLE_FILE)
    stopTime = timeit.default_timer()
    elapsedTime = stopTime - startTime
    print("Elapsed time to pickle df was " + str(round(elapsedTime)) + " seconds.")

    startTime = timeit.default_timer()
    profileit(df, PROFILING_FILE)
    stopTime = timeit.default_timer()
    elapsedTime = stopTime - startTime
    print("Elapsed time to profile was " + str(round(elapsedTime)) + " seconds.")

    print("")
    print("Done.")
    
else:
    
    print("")
    print("Skipped.")



na vs. null vs. nan

Note, according to https://datascience.stackexchange.com/questions/37878/difference-between-isna-and-isnull-in-pandas...
  1. To detect NaN values, numpy uses np.isnan().
  2. To detect NaN values, pandas uses either pd.isna(x) or .isnull(). (A short demo follows.)
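
A quick demo of the difference (np.isnan only accepts numeric input, while pd.isna also handles None and strings):

import numpy as np
import pandas as pd

print(np.isnan(np.nan))     # True
print(pd.isna(np.nan))      # True
print(pd.isna(None))        # True
print(pd.isna("text"))      # False
# np.isnan(None) or np.isnan("text") would raise a TypeError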

# After importing from Excel you might need to delete junk rows with nan / nulls in the key column
df = df[ df["matter_number"].notna() ]


# Convert zip code (numeric with some na) to string with leading zeroes where needed
df["zip"] = df["Postal Code"].apply(lambda x: "" if pd.isna(x) else str(x).strip().zfill(5)[0:5])





Convert all column names in a Pandas dataframe to lowercase

df.columns = df.columns.str.lower()
...and replace or remove other characters:
df.columns = df.columns.str.lower().str.replace(' ', '_') \
    .str.replace('/', '_') \
    .str.replace('(', '', regex=False) \
    .str.replace(')', '', regex=False)



Show column dtypes

for col in df:
    if df[col].dtype.name != 'object':
        print('col', col)
        print('type', df[col].dtype.name)
        print()




Check dtype. Create string field with leading zeroes.

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

if is_numeric_dtype(df.eeid):
    df.eeid = df.eeid.apply(lambda x: str(x))
df.eeid = df.eeid.apply(lambda x: str(x).strip().zfill(7))   # 7 byte string with leading zero if needed




Rename a column in a Pandas dataframe

df.rename(columns={"Fra Reportable Status": "FRA Reportable Status"}, inplace=True)


df.rename(columns={"oldvar1": "newvar1", "oldvar2": "newvar2"}, inplace=True)



Create a dictionary from a pandas dataframe

df = pd.DataFrame({'id': ['6111A', '6222A', '6333A'], 'name': ['lawyer a', 'lawyer b', 'lawyer c']})
dict(df.values.tolist())

Result:
{'6111A': 'lawyer a', '6222A': 'lawyer b', '6333A': 'lawyer c'}




Beware the use of the name "name"
It will sometimes give you the index instead!

In [41]: df = pd.DataFrame({'name':["Henny", "Quentin", "Oscar"], 'age':[4, 3, 2]})
    ...: print(df)
    ...: print()
      name  age
0    Henny    4
1  Quentin    3
2    Oscar    2


In [42]: print(df.name)
    ...: print()
0      Henny
1    Quentin
2      Oscar
Name: name, dtype: object


In [43]: print(df['name'])
    ...: print()
0      Henny
1    Quentin
2      Oscar
Name: name, dtype: object


In [44]: for idx, row in df.iterrows():
    ...:     print(f"{row['name']} is {row['age']} years old.")
    ...: print()
Henny is 4 years old.
Quentin is 3 years old.
Oscar is 2 years old.


In [45]: for idx, row in df.iterrows():
    ...:     print(f"{row.name} is {row.age} years old.")
    ...: print()
0 is 4 years old.
1 is 3 years old.
2 is 2 years old.




Loop through dataframe rows.
Convert a single float to int, blank when missing (NaN).
Write HTML. Rows should fill available space.

import math       # for isnan()

htmlPath = HTML_FOLDER + SEP + "inventory_summary.html"
file1 = open(htmlPath,"w")

html = \
"""
<h3>Inventory Evaluation &amp; Impact:</h3>
<table width="100%">
    <thead>
        <tr style="vertical-align:bottom">
            <th>Product</th>
            <th>Selected</th>
            <th>Location<br/>Count</th>
            <th>Inventory<br/>Today</th>
            <th>Capacity</th>
        </tr>
    </thead>
    <tbody>
"""

file1.write(html + "\n") 

for idx, row in df.iterrows():
    product              = row["product"]
    selected             = row["selected"]
    locations            = row["locations"]
    inventory            = row["inventory"]
    capacity             = row["capacity"]
    
    if selected == "Y":
        selected = "&#10004;"      # checkmark if Y
    else:
        selected = "";

    if math.isnan(locations):
        locations = "";
    else:
        locations = int(locations)
    
    if math.isnan(inventory):
        inventory = "";
    else:
        inventory = int(inventory)
    
    if math.isnan(capacity):
        capacity = "";
    else:
        capacity = int(capacity)

    html = \
        f'<tr>'                        + \
        f'<td>{product}</td>'          + \
        f'<td>{selected}</td>'         + \
        f'<td>{locations}</td>'        + \
        f'<td>{inventory}</td>'        + \
        f'<td>{capacity}</td>'         + \
        f'</tr>'

    file1.write(html + "\n") 


html = \
"""
    </tbody>
</table>
</body>
</html>
"""

file1.write(html + "\n")

file1.close()

print("HTML output can be found at " + htmlPath)




Changing values in a dataframe with np.where

# From the docs for DataFrame.where / Series.where: "Where cond is True, keep the
# original value. Where False, replace with corresponding value from other."
# np.where works the other way around: np.where(cond, value_if_true, value_if_false),
# so the examples below are correct: where the condition is False the original value is kept.

# change "4" to "04", "5" to "05", "6" to "06". Else no change.
df["TC"] = np.where(df["TC"] == "4", "04", df["TC"])
df["TC"] = np.where(df["TC"] == "5", "05", df["TC"])
df["TC"] = np.where(df["TC"] == "6", "06", df["TC"])






Encoding when reading csv

# Unable to read Payroll data as we normally would due to odd encoding.
# I would normally open with Notepad++ to convert encoding, but all but one file was
# too large to open with Notepad++. Actual encoding for that file was "UCS-2 LE BOM".
# To read these files with Pandas read_csv, you must use: encoding="utf_16_le"

DTYPES = { 'Cst Ctr': str, 'EE ID': str, 'Func': str, 'Natural Expns Cd': str, 'Tm Cd': str }
DATE_COLS = [ 'Dt Appr', 'Dt Wrked' ]

df = pd.read_csv(IMPORT_FILE, sep="\t", low_memory=False, encoding='utf_16_le',
        dtype=DTYPES, parse_dates=DATE_COLS)


		
# I was able to read a file whose encoding according to Notepad++ was "UCS-2 LE BOM" by using encoding="utf_16".



Read a pickled file

if True:
    
    WHICH_FILE = "payroll_2015.pickle"
    startTime = timeit.default_timer()
    df = pd.read_pickle(PICKLE_FOLDER + SEP + WHICH_FILE)
    stopTime = timeit.default_timer()
    elapsedTime = stopTime - startTime
    print("Elapsed time to read " + WHICH_FILE + " was " + str(round(elapsedTime)) + " seconds.")
    print("There are " + str(df.shape[0]) + " rows and " + str(df.shape[1]) + " columns.")
    print(df.info())
    print("Done.")

else:
    
    print("")
    print("Skipped.")



Read first n rows from a csv file

df = pd.read_csv(INBOUND_FOLDER + SEP + WHICH_FILE, nrows=500)



Write dataframe to csv

PATH = OUTBOUND_FOLDER + SEP + "combinations.csv"
print("Write csv file to " + PATH)
dfCombos.to_csv(PATH, index=False)



Concatenate multiple dataframes into a single dataframe

#==========================================================================
# Concatenate all payroll dataframes into a single dataframe
#==========================================================================
if False:
    
    df15 = pd.read_pickle(PICKLE_FOLDER + SEP + "payroll_2015.pickle")
    df16 = pd.read_pickle(PICKLE_FOLDER + SEP + "payroll_2016.pickle")
    df17 = pd.read_pickle(PICKLE_FOLDER + SEP + "payroll_2017.pickle")
    df18 = pd.read_pickle(PICKLE_FOLDER + SEP + "payroll_2018.pickle")
    df19 = pd.read_pickle(PICKLE_FOLDER + SEP + "payroll_2019.pickle")
    
    frames = [df15, df16, df17, df18, df19]
    df = pd.concat(frames, ignore_index=True)
    pd.to_pickle(df, PICKLE_FOLDER + SEP + "payroll_2015_2019.pickle")
    
    print("There are " + str(df.shape[0]) + " rows and " + str(df.shape[1]) + " columns.")
    print("")
    print(df.head())
    print("")
    print(df.tail())
    print("")
    
    print("Done.")
    
else:
    
    print("")
    print("Skipped.")



Save object other than dataframe to a pickle file

import pickle                             # required for pickle.dump()

path = PICKLE_FOLDER + SEP + "Predict_Amt_Direct_Model.pickle.dat"
pickle.dump(thisXGBoost, open(path, "wb"))
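
To read it back later, pickle.load is the counterpart (a minimal sketch):

with open(path, "rb") as f:
    thisXGBoost = pickle.load(f)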



Import fixed position file

#==========================================================================
# Import training
#==========================================================================
# The training file had a tab-delimited header and fixed-position fields.
# So I used Notepad++ to delete the header row and add back a bogus header.
# But then I decided to simply skip the header instead.
# It also uses encoding "UCS-2 LE BOM".

NAMES = [ "EMP_ID"
    , "PRSNTR_ID"
    , "TRNG_CD"
    , "TRNG_DEPT"
    , "TRND_DT"
    ]

COLSPECS = [ [1-1, 7]
    , [ 9-1, 15]
    , [17-1, 23]
    , [25-1, 27]
    , [29-1, 36]
    ]

DTYPES = { "EMP_ID": str 
    , "PRSNTR_ID"  : str
    , "TRNG_CD"    : str
    , "TRNG_DEPT"  : str }

DATE_COLS = [ 'TRND_DT' ]

IMPORT_FILE = RAWDATA_FOLDER + SEP + "training" + SEP + "TSAFE Training.txt"
PICKLE_FILE = PICKLE_FOLDER + SEP + "training.pickle"
PROFILING_FILE = PROFILING_FOLDER + SEP + "training.html"
df = pd.read_fwf(IMPORT_FILE, header=None, skiprows=1, encoding='utf_16_le', 
    colspecs=COLSPECS, names=NAMES, dtype=DTYPES, parse_dates=DATE_COLS)



Delete/keep rows in Pandas dataframe based on year of date

import datetime as dt

try:
    df["date"] = df["date"].apply(lambda x: str(x)[0:10])
    df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d", errors="coerce")
except:                                    # there are some 2999-mm-dd dates in there...
    df["date"] = pd.to_datetime("1900-01-01")
    
df = df[ df["date"].dt.year >= 1960 ]      # keep if year ge 1950 (to eliminate junk)



Here is an example of an apply function (not lambda) which returns more than one value to df

######################################################################################    
#   Here is an example of an apply function returning more than one value to df !!!  #
######################################################################################

ARGS_LIST  = ["var1", "var2",  "var3",  "var4", "src" ]

def myfunc(args):
    
    if np.isnan(args.amount):
        var1 = 1
    else:
        var1 = args.var1

    if args._merge == "left_only":
        var2  = args.x_var2
        var3  = args.x_var3
        var4  = args.x_var4
        src   = "X ONLY"
    elif args._merge == "right_only":
        var2  = args.y_var2
        var3  = args.y_var3
        var4  = args.y_var2           # var4 is not in Y file
        src   = "Y ONLY"
    elif args.x_var2 == args.y_var2:
        var2  = args.x_var2
        var3  = args.x_var3
        var4  = args.x_var4
        src   = "BOTH SAME"
    else:
        var2  = args.y_var2
        var3  = args.y_var3
        var4  = args.y_var2           # var4 is not in Y file
        src   = "BOTH DIFF"

    answer = pd.Series( [var1, var2, var3, var4, src] , index=ARGS_LIST)
    return answer

df2 = df.copy()   # to prevent a warning message about a dataframe modifying itself
df2[ ARGS_LIST ] = df.apply(myfunc, axis=1)
print(df2.src.value_counts())



"Master" and "Transaction"
Update/merge/insert one pandas dataframe with another.

# It took forever to get this right! There were a whole lot of "almost" solutions!
# Note "right" is the "transaction", "left" is the "old master"
# Note "left" comes before "right" when using combine_first
# The receiving dataframe becomes the "new master"
# Will insert (add) rows in "right" (trans) not found in "left" (master)
# This behavior mimics a SAS data step merge.
# see https://stackoverflow.com/questions/45796018/better-way-for-merge-update-insert-pandas-dataframes

df_old_master.set_index(BY_VARS, inplace=True)
df_trans.set_index(BY_VARS, inplace=True)
df_new_master = df_trans.combine_first(df_old_master).reset_index()



Create rows to be added / appended to a dataframe

all_rows = []
for idx, row in df.iterrows():
    row.pocket_begins_at = int(row.drawer_col_pos)
    row.pocket_ends_at   = int(row.drawer_col_pos + row.pocket_width - 1)
    for drawer_col_pos in range(row.pocket_begins_at, row.pocket_ends_at + 1):
        row.drawer_col_pos = drawer_col_pos
        all_rows.append(row.copy())
df = pd.DataFrame(all_rows, columns=df.columns)

# records were added so sort again
df.sort_values(by=BY_VARS, kind="mergesort", ignore_index=True, inplace=True)  



Get all files in directory

import os

for root, dirs, files in os.walk(directory):
    for file in files:
        if file.endswith('.txt'):
            print(file)



Create a dataframe with a sorted list of all columns and their dtypes

dfTypes = df.dtypes
dfTypes = dfTypes.to_frame()
dfTypes.sort_index(inplace=True)
dfTypes.reset_index(inplace=True)
dfTypes.columns = ["variable", "dtype"]



lag (previous) and lead (next)

df = pd.DataFrame({'a':['v', 'w', 'x', 'y', 'z']})

df["lag"]  = df.a.shift(+1)    # previous value

df["lead"] = df.a.shift(-1)    # next value

df
Out[440]: 
   a  lag lead
0  v  NaN    w
1  w    v    x
2  x    w    y
3  y    x    z
4  z    y  NaN



Drop columns from a dataframe

DROPS = ["eeid", "date" ]
df.drop(DROPS, axis=1, inplace=True)

# one column, and only if it exists...
# errors="ignore" can be used for multiple columns as well
df.drop("eeid", axis=1, inplace=True, errors="ignore")



Drop columns EXCEPT one from a dataframe

DROP_COLUMNS = list(set(dfPART.columns) - set(["INCDNT_NUM__PART"]))   # all except INCDNT_NUM__PART
dfPART.drop(DROP_COLUMNS, axis=1, inplace=True)



Keep selected rows of a dataframe

dfPART = dfPART.loc[dfPART["DISP_CODE__PART"] == "2A"]    # keep rows with 2A only

# Reminder: compound conditions require parentheses, or else you will get:
# TypeError: cannot compare a dtyped [object] array with a scalar of type [bool]
dfCLM1 = dfCLM1.loc[ (dfCLM1["CURR_STATUS__CLM1"] == "1") | (dfCLM1["CURR_STATUS__CLM1"] == "4") ]



Get current datetime as string "yyyy-mm-dd hh:mm:ss"

import datetime as dt

RUNTIME = str(dt.datetime.now())[0:19]
print("RUNTIME      = " + RUNTIME)



Extract month, day, and year from date in Pandas dataframe

dfCLM1["incident_year"]  = dfCLM1["DATE_INCDNT__CLM1"].apply(lambda x: int(x.strftime('%Y')))
dfCLM1["incident_month"] = dfCLM1["DATE_INCDNT__CLM1"].apply(lambda x: int(x.strftime('%m')))
dfCLM1["incident_day"]   = dfCLM1["DATE_INCDNT__CLM1"].apply(lambda x: int(x.strftime('%d')))



Remove apostrophes around string in Pandas dataframe

dfWFUT["INCDNT_NUM__WFUT"] = dfWFUT["INCDNT_NUM__WFUT"].apply(lambda x: x.strip("'"))



apply and lambda using multiple columns of Pandas dataframe as input

dfScored["Case_Stage"] = dfScored.apply(lambda x: "Direct" if ((x.COURT_JURIS__LEGL is None) &
        (x.PLAINT_ATTY__LEGL is None) & (x.DEF_ATTY__LEGL is None)) else "Litigation"
        , axis=1)


# create dummy variables for factors found in csr_entry
df["pi"]      = df["csr_entry"].apply(lambda x: 1 if str(x).lower().startswith("pi: ")      else 0)
df["mva"]     = df["csr_entry"].apply(lambda x: 1 if str(x).lower().startswith("mva: ")     else 0)
df["suspend"] = df["csr_entry"].apply(lambda x: 1 if str(x).lower().startswith("suspend: ") else 0)

# clear csr_entry while testing, to make it easier to find other factors
df["csr_entry"] = df.apply(lambda x: " " if x["pi"]      == 1 else x["csr_entry"], axis=1)    
df["csr_entry"] = df.apply(lambda x: " " if x["mva"]     == 1 else x["csr_entry"], axis=1)    
df["csr_entry"] = df.apply(lambda x: " " if x["suspend"] == 1 else x["csr_entry"], axis=1)    



Other examples of apply lambda

# the following required the str() function else get error message
# AttributeError: 'float' object has no attribute 'startswith'
df["pi"] = df["csr_entry"].apply(lambda x: 1 if str(x).startswith("pi: ") else 0)



Replace one value with another in Pandas dataframe

import pandas as pd
import numpy as np

# for column
df['column'] = df['column'].replace(np.nan, 0)

# inplace
df['column'].replace(np.nan, 0, inplace=True)

# for whole dataframe
df = df.replace(np.nan, 0)

# inplace
df.replace(np.nan, 0, inplace=True)

# replace all instances of one date value with another
df[DATE_COLS].replace(to_replace=FAKE_LOW_DATE__NEGO, value=FAKE_HIGH_DATE__NEGO, inplace=True)




Add sequence number by group

df['seq'] = df.groupby(['INCDNT_NUM__NEGO']).cumcount()

df['seq'] = df.groupby(['INCDNT_NUM__CLM1', 'INCDNT_TYPE']).cumcount()



Count of all values in a dataframe column (crude frequency distribution)

print(str(dfCLM1["IND_MODEL_TRAINING"].value_counts(dropna=False)))


print( pd.crosstab( df["DownTheSide"], df["AcrossTheTop'] ) )
print( pd.crosstab( df.DownTheSide, df.AcrossTheTop ) )



Sum dummy variables (0/1) to get count of 1's (crosstab of sorts...)

# sum dummy variables (0/1) to get count of 1's
xtab = df.groupby(["year"]).agg({
    'awardjob'   :'sum',
    'collision'  :'sum',
    'counsel'    :'sum',
    'cutoff'     :'sum',
    'discipline' :'sum',
    'disqual'    :'sum',
    'furloughed' :'sum',
    'hired'      :'sum',
    'newhire'    :'sum',
    'passed'     :'sum',
    'pi'         :'sum',
    'promoted'   :'sum',
    'reprimand'  :'sum',
    'serious'    :'sum',
    'sicklv'     :'sum',
    'suspend'    :'sum',
    'trained'    :'sum',
    'transfer'   :'sum',
    'warning'    :'sum' }).reset_index()



Frequency distribution (fd) of records per month yyyymm

# QC check added 20200908. Number of records by month.
# DT_WRKED is a datetime field, fd is a dataframe.
df["yyyymm"] = df['DT_WRKED'].dt.strftime('%Y%m')   
fd = df["yyyymm"].value_counts().rename_axis('month').reset_index(name='counts')
fd.sort_values(by=['month'], inplace=True)



Count of distinct values in a dataframe column

print("Number of distinct incidents: " + str(dfBinnedScores["INCDNT_NUM__CLM1"].nunique()))



Create a list of distinct values in a dataframe column

MECH_EEID_LIST = list( dfMech["eeid"].unique() )



Change dtype of a column of a dataframe from int to string

df["INJ_SEV__INJS"] = df["INJ_SEV__INJS"].apply(str)



String Interpolation / f-Strings (Python 3.6+)

firstName = "Julia"
lastName = "Qualls"
result = f'Hello, {firstName} {lastName}!'
print(result)



Concatenate / append / extend two lists

list1 = ["a", "b"]
list2 = ["c", "d"]

both=[]   # empty list
both.append(list1)   #  probably not what you want
both.append(list2)   # result is a list of lists
print(both)          # result is [['a', 'b'], ['c', 'd']]

both=[]   # empty list
both.extend(list1)
both.extend(list2)
print(both)          # result is ['a', 'b', 'c', 'd']
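
The + operator gives the same result as the two extend calls:

both = list1 + list2
print(both)          # result is ['a', 'b', 'c', 'd']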



Write a column of a dataframe to a text or csv file

ids = df["INCDNT_NUM__CLM1"].unique()

file1 = open(OUTBOUND_FOLDER + SEP + "incdnt_nums.txt","w")
for id in ids:
    file1.write(id + "\n") 
file1.close()



Simple descriptive statistics for all columns in a Pandas dataframe

for clm in dfInvoices.columns:
	print("Column: " + clm)
	print(dfInvoices[clm].describe())
	print()



OutOfBoundsDatetime while using PandaSQL:
Out of bounds nanosecond timestamp: 3005-03-14 00:00:00


Strange error: I wasn't even referencing the date fields in question!
How error was diagnosed:

    dfInvoices["invoice_start_date"].unique()
    Out[28]: 
    array(['2011-05-23T00:00:00.000000000', '2011-05-16T00:00:00.000000000',
           '2005-03-01T00:00:00.000000000', ...,
           '2019-11-03T00:00:00.000000000', '2020-03-08T00:00:00.000000000',
           '2020-05-13T00:00:00.000000000'], dtype='datetime64[ns]')
    
    dfInvoices["invoice_date"].unique()
    Out[29]: 
    array([datetime.datetime(2011, 6, 30, 0, 0),
           datetime.datetime(2011, 6, 23, 0, 0),
           datetime.datetime(2005, 5, 11, 0, 0), ...,
           datetime.datetime(2003, 4, 9, 0, 0),
           datetime.datetime(2003, 5, 12, 0, 0),
           datetime.datetime(2020, 3, 28, 0, 0)], dtype=object)
	   
	   
The fix:

    # inconsistent datetime formats were causing an OutOfBoundsDatetime exception in SQL statement later
    DATE_COLS = [
      "invoice_date"
      , "invoice_start_date"
      , "invoice_end_date"
      ]
    
    for COL in DATE_COLS:
        dfInvoices[COL] = pd.to_datetime( dfInvoices[COL], errors="coerce" )




Get distinct values for a column in three different dataframes


I did this because PandaSQL does not support full outer join.
# get all unique matter_number from three different dataframes

nbrs = set()
nbrs.update( list( dfAgent["agent_matter_number"].unique()) )
nbrs.update( list(   dfDCA["dca_matter_number"  ].unique()) )
nbrs.update( list(    dfRM["rm_matter_number"   ].unique()) )
dfNbrs = pd.DataFrame( nbrs )
dfNbrs.columns = ['matter_number']




PANDASQL

import pandasql as ps           # required for ps.sqldf()

sql = """
   select 
     INCDNT_NUM__CLM1
     , CLMT_REF__CLM1
     , DATE_INCDNT__CLM1
          
     , DISP_CODE__SETT
     , DATE_SETTLED__SETT
     , AMOUNT__SETT
     , DATE_CLOSED__SETT
          
#    , INJ_RANK__INJS
#    , INJ_SEV__INJS
          
   from dfCLM1
   left join dfSETT   on INCDNT_NUM__CLM1 = INCDNT_NUM__SETT
#  left join dfINJS   on INCDNT_NUM__CLM1 = INCDNT_NUM__INJS
      
   order by INCDNT_NUM__CLM1
   """

import re      # required for regular expressions

sql = re.sub(r'#.*', '', sql)   # remove commented code (very cool!)
df = ps.sqldf(sql, locals())


# pandasql loses type on datetime variables
DATE_COLS = [
  "DATE_INCDNT__CLM1"
  , "DATE_SETTLED__SETT"
  , "DATE_CLOSED__SETT"
  ]

for COL in DATE_COLS:
    df[COL] = pd.to_datetime( df[COL], errors='coerce' )  # errors become NaT




Number of days between two datetime columns...

import pandasql as ps

sql = """
   select a.eeid
     , a.date
     , a.pr_tc_holiday
     , min(b.date)            as date_next_holiday
     , max(c.date)            as date_prev_holiday
   from df as a
   left join df as b
      on b.pr_tc_holiday = 1
      and a.eeid = b.eeid
      and a.date < b.date
   left join df as c
      on c.pr_tc_holiday = 1
      and a.eeid = c.eeid
      and a.date > c.date
   group by a.eeid
   , a.date
   , a.pr_tc_holiday
   order by a.eeid
   , a.date
   """

print("Begin SQL join")
res = ps.sqldf(sql, locals())
print("SQL join done.")
print()

print("Fix date columns")
DATE_COLS = [ "date", "date_next_holiday", "date_prev_holiday" ]

for COL in DATE_COLS:
    res[COL] = pd.to_datetime( res[COL] )
    
sql = """
   select eeid
     , date
     , pr_tc_holiday
     , date_next_holiday
     , ( strftime('%s',date_next_holiday) - strftime('%s',date) )
         / (24*60*60)                        as days_until_next_holiday
     , date_prev_holiday
     , ( strftime('%s',date) - strftime('%s',date_prev_holiday) )
         / (24*60*60)                        as days_since_prev_holiday
   from res
   order by eeid
   , date
   """

print("Begin SQL query")
res2 = ps.sqldf(sql, locals())
print("SQL query done.")
print()

print("Fix date columns")
DATE_COLS = [ "date", "date_next_holiday", "date_prev_holiday" ]

for COL in DATE_COLS:
    res2[COL] = pd.to_datetime( res2[COL] )
	
	
	
	
# In a pandasql query, to assign a missing value (NaN / None) to a column, use null.
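
A minimal sketch using the ps.sqldf pattern above (eeid and date are placeholder column names):

sql = """
   select eeid
     , date
     , null as date_next_holiday    -- assigns a missing value; comes back as NaN / None
   from df
   """
res = ps.sqldf(sql, locals())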



My function to fix all datetime columns following a SQL query...

import pandas as pd

# you will need to create list ALL_DATE_COLS with names of date columns.

def fix_dates_after_sql(df):
    # pandasql loses type on datetime variables
    for COL in ALL_DATE_COLS:
        if COL in df.columns:
            df[COL] = pd.to_datetime(df[COL])
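
Example usage, reusing the ps.sqldf pattern from above (the column names in ALL_DATE_COLS are just illustrative):

ALL_DATE_COLS = [ "DATE_INCDNT__CLM1", "DATE_SETTLED__SETT", "DATE_CLOSED__SETT" ]

df = ps.sqldf(sql, locals())
fix_dates_after_sql(df)          # converts any of those columns present in df back to datetime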



Select record with most recent date / drop duplicates / dedupe

print()
firstlog.skip()
print(f"Before selecting most recent date, there are {dfCourt.shape[0]} rows x {dfCourt.shape[1]} columns," +
      f" with {dfCourt.shape[0] - dfCourt['matter_number'].nunique()} duplicate matter_number.")
firstlog.message(f"Before selecting most recent date, there are {dfCourt.shape[0]} rows x {dfCourt.shape[1]} columns," +
      f" with {dfCourt.shape[0] - dfCourt['matter_number'].nunique()} duplicate matter_number.")

sql = """
    select a.*
    from dfCourt as a
    , (
           select matter_number
           , max(file_date) as most_recent_file_date
           from dfCourt
           group by matter_number
      ) as b
    where a.matter_number    = b.matter_number
      and a.file_date        = b.most_recent_file_date
      
    """

sql = re.sub(r'#.*', '', sql)   # remove commented code (very cool!)
dfCourtMostRecent = ps.sqldf(sql, locals())

print(f"After selecting most recent date, there are {dfCourtMostRecent.shape[0]} rows x {dfCourtMostRecent.shape[1]} columns," +
      f" with {dfCourtMostRecent.shape[0] - dfCourt['matter_number'].nunique()} duplicate matter_number.")
firstlog.message(f"After selecting most recent date, there are {dfCourtMostRecent.shape[0]} rows x {dfCourtMostRecent.shape[1]} columns," +
      f" with {dfCourtMostRecent.shape[0] - dfCourt['matter_number'].nunique()} duplicate matter_number.")
print()
firstlog.skip()

# one dupe left...
dfCourtMostRecent.drop_duplicates(subset="matter_number", keep="first", inplace=True)

print(f"After drop_duplicates, there are {dfCourtMostRecent.shape[0]} rows x {dfCourtMostRecent.shape[1]} columns," + 
      f" with {dfCourtMostRecent.shape[0] - dfCourt['matter_number'].nunique()} duplicate matter_number.")
firstlog.message(f"After drop_duplicates, there are {dfCourtMostRecent.shape[0]} rows x {dfCourtMostRecent.shape[1]} columns," +
      f" with {dfCourtMostRecent.shape[0] - dfCourt['matter_number'].nunique()} duplicate matter_number.")
print()
firstlog.skip()




Normal end of program

#==========================================================================
# Normal end of program.
#==========================================================================

programStopTime = timeit.default_timer()
programElapsedTime = programStopTime - programStartTime

print()
print()
print(DASHES)
print("Normal end of program " + PROGRAM_ID + " (RC=" + str(RC) + ")" )
print("Elapsed time for this program was " + str(round(programElapsedTime / 60)) + " minutes.")
print(DASHES)
print()




terminateProgram function

(I felt the need for something callable.)
#==========================================================================
# Terminate program
# try/except is used to avoid exception on sys.exit()
#==========================================================================
import sys

def terminateProgram(RC):

    if (RC == 0):
        print("Normal end of program (RC=0).")
    else:
        print("Abnormal end of program (RC=" + str(RC) + ").")
        
    try:
        sys.exit(RC)
    except SystemExit:
        pass
		
def terminateProgramWithID(RC, PROGRAM_ID):

    if (RC == 0):
        print("Normal end of program " + PROGRAM_ID + " (RC=0).")
    else:
        print("Abnormal end of program " + PROGRAM_ID + " (RC=" + str(RC) + ").")
        
    try:
        sys.exit(RC)
    except SystemExit:
        pass




Generate string of random characters

#==========================================================================
# Generate a 40-character random string.
# Example: 7wrnrJiyFDo8A39KAWoc56cUX8UlaKX4JU2Ac8V6
#==========================================================================

# string.ascii_letters gives mixed case
# can also use string.ascii_lowercase or string.ascii_uppercase

import random
import string
# (don't name the result "random": that would shadow the module)
rand_str = ''.join([random.choice(string.ascii_letters + string.digits)
                    for n in range(40)])
print(rand_str)
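
If the string will be used as a secret or token, the secrets module is the preferred source of randomness (a sketch, same idea):

import secrets
import string

token = ''.join(secrets.choice(string.ascii_letters + string.digits) for n in range(40))
print(token)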




This was a workaround when a PandaSQL left join of many tables failed due to memory

print("\nBegin with CLM1...it **MUST** be first...")
joined = dfCLM1.set_index("INCDNT_NUM__CLM1")
dfCLM1 = -1    # free up memory
print (f"So far we have {joined.shape[0]} rows x {joined.shape[1]} clms.")


print("\nNow join GCIR...")
left  = joined.copy()
right = dfGCIR.set_index("INCDNT_NUM__GCIR")
dfGCIR = -1    # free up memory
joined = pd.merge(left, right, how='left', left_index=True, right_index=True)
print (f"So far we have {joined.shape[0]} rows x {joined.shape[1]} clms.")


print("\nNow join LEGL...")
left  = joined.copy()
right = dfLEGL.set_index("INCDNT_NUM__LEGL")
dfLEGL = -1    # free up memory
joined = pd.merge(left, right, how='left', left_index=True, right_index=True)
print (f"So far we have {joined.shape[0]} rows x {joined.shape[1]} clms.")
    

print("\nNow join SETT...")
left  = joined.copy()
right = dfSETT.set_index("INCDNT_NUM__SETT")
dfSETT = -1    # free up memory
joined = pd.merge(left, right, how='left', left_index=True, right_index=True)
print (f"So far we have {joined.shape[0]} rows x {joined.shape[1]} clms.")




MongoDB

#==========================================================================
# MongoDB constants...
#==========================================================================

MONGO_DATABASE_NAME   = 'xxxxxxxx'
MONGO_COLLECTION_HOSPITALS = 'hospitals'
MONGO_COLLECTION_PRODUCTS  = 'products'



#==========================================================================
# Mongo DB settings...
#==========================================================================

from pymongo import MongoClient           # required for MongoClient()

client     = MongoClient()  # connect to Mongo
db         = client[MONGO_DATABASE_NAME]               # database
hospColl   = db[MONGO_COLLECTION_HOSPITALS]            # hospitals collection
simpColl   = db[MONGO_COLLECTION_PRODUCTS]             # products collection

logIt("Mongo database name is " + MONGO_DATABASE_NAME)
logIt("Mongo hospitals collection name is " + MONGO_COLLECTION_HOSPITALS)
logIt("Mongo products collection name is "  + MONGO_COLLECTION_PRODUCTS)
logIt("")

logIt('DB access is read only.')
logIt("")



#--------------------------------------------------------------------------
# Retrieve entire products collection from Mongo.
# It will be accessed by medID as the index.
#--------------------------------------------------------------------------
        
logIt("Retrieving products...")
productList = list(db.products.find())
dfProduct = pd.DataFrame(productList)
dfProduct.set_index([ 'medID' ], inplace=True)
fstring = f"dfProduct has {dfProduct.shape[0]} rows and {dfProduct.shape[1]} columns."
logIt(fstring)
logIt("")



#--------------------------------------------------------------------------
# Retrieve hospital document
#--------------------------------------------------------------------------
        
logIt("Retrieving document for hospital: " + HOSPITAL_NAME)
query = { "hospitalName" : HOSPITAL_NAME }
doc = hospColl.find_one(query)



# Some other stuff happened here



#--------------------------------------------------------------------------
# Loop thru meds writing to replacement meds file.
#--------------------------------------------------------------------------


targetMedsList = doc["targetMeds"]     # retrieve list from MongoDB document

r = 0

for med in targetMedsList:             # each entry in the list is a dictionary (don't name the loop variable "dict")

    medID               = med["medID"]
    medClass            = med["medClass"]
    medDesc             = med["medDesc"]
    medClassMatch       = med["medClassMatch"]
    drugStrengthVol     = med["drugStrengthVol"]
    drugStrengthUOM     = med["drugStrengthUOM"]
    fillVol             = med["fillVol"]
    fillUOM             = med["fillUOM"]
    package             = med["package"]
    stationCount        = med["stationCount"]
    matchingNDC         = med["matchingNDC"]
    overrideNDC         = med["overrideNDC"]
    action              = med["action"]
    useInTargets        = med["useInTargets"]
    useInExisting       = med["useInExisting"]
    useInReplacements   = med["useInReplacements"]

    useThisNDC = overrideNDC if overrideNDC else matchingNDC


    if (useInReplacements == True):
	
        r = r + 1
        
        unmappedDrugName   = dfProduct.loc[useThisNDC]["unmappedDrugName"]
        drugStrengthVol    = dfProduct.loc[useThisNDC]["drugStrengthVol" ]
        drugStrengthUOM    = dfProduct.loc[useThisNDC]["drugStrengthUOM" ]
        fillVol            = dfProduct.loc[useThisNDC]["fillVol"         ]
        fillUOM            = dfProduct.loc[useThisNDC]["fillUOM"         ]

        ndcDesc = f"{unmappedDrugName} {drugStrengthVol}{drugStrengthUOM} {fillVol}{fillUOM}"

        fstring = f"/* {60*'-'} "
        replacementFile.write(fstring + nl)
        fstring = f" * {action:7} : {medDesc} "
        replacementFile.write(fstring + nl)
        fstring = f" * {'with':7} : {ndcDesc} "
        replacementFile.write(fstring + nl)
        fstring = f" * {60*'-'} */"
        replacementFile.write(fstring + nl + nl)
        fstring = f"medID           = \"{medID}\" ; "
        replacementFile.write(fstring + nl)
        fstring = f"NDC             = \"{useThisNDC}\" ; "
        replacementFile.write(fstring + nl)
        fstring = f"isOverrideNDC   = {1 if overrideNDC else 0:1} ; "
        replacementFile.write(fstring + nl)
        fstring = f"isTarget        = {1 if action in ('replace', 'exists') else 0:1} ; "
        replacementFile.write(fstring + nl)
        fstring = f"isExisting      = {1 if action == 'exists'  else 0:1} ; "
        replacementFile.write(fstring + nl)
        fstring = f"output ; "
        replacementFile.write(fstring + nl + nl)

       
if (r == 0):
    fstring = f"delete;    * do not write an empty record ; "
    replacementFile.write(fstring + nl + nl)
        
fstring = f"run;"
replacementFile.write(fstring + nl)
   
replacementFile.close()

logIt("")
logIt(str(  r  ) + " products written to " + replacementPath)
logIt("")



THE ABOVE CODE CREATED THIS SAS CODE:

* This is D:\Optilytics\hospitals\Baptist_Lexington\input\create_replacement_medids.txt ;
* This file created by get_target_and_existing_meds_from_mongo.py at 20200604_141329 ;

data &TBL_REPLACEMENT_MEDIDS ;

attrib medID           length=$15 ;
attrib NDC             length=$15 ;
attrib isOverrideNDC   length=  8 ;

/* ------------------------------------------------------------ 
 * replace : betamethasone a 6mg/1mL 5mL vial 
 * with    : SPECIAL Dexamethasone 10.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "580213" ; 
NDC             = "NDC76045-109-10" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : dexamethasone ( 4mg/1mL 1mL soln 
 * with    : SPECIAL Morphine 5.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "580578" ; 
NDC             = "NDC76045-006-10" ; 
isOverrideNDC   = 1 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : diphenhydrAMIN 50mg/1mL 1mL soln 
 * with    : SPECIAL Diphenhydramine 50.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "580658" ; 
NDC             = "NDC76045-102-10" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : heparin s 5000units/1mL 1mL soln 
 * with    : SPECIAL Heparin 5000.0UNITS 1.0mL 
 * ------------------------------------------------------------ */

medID           = "510096" ; 
NDC             = "NDC76045-108-10" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * exists  : HYDROmorphone ( 2mg/1mL 1mL soln 
 * with    : SPECIAL Dilaudid MV 2.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "409252" ; 
NDC             = "NDC76045-010-11" ; 
isOverrideNDC   = 1 ; 
isTarget        = 1 ; 
isExisting      = 1 ; 
output ; 

/* ------------------------------------------------------------ 
 * exists  : HYDROmorphon 1mg/1mL 1mL solutio 
 * with    : SPECIAL Dilaudid MV 1.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "531031" ; 
NDC             = "NDC76045-009-11" ; 
isOverrideNDC   = 1 ; 
isTarget        = 1 ; 
isExisting      = 1 ; 
output ; 

/* ------------------------------------------------------------ 
 * exists  : HYDROmorphon 2mg/1mL 1mL solutio 
 * with    : SPECIAL Dilaudid MV 2.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "531269" ; 
NDC             = "NDC76045-010-11" ; 
isOverrideNDC   = 1 ; 
isTarget        = 1 ; 
isExisting      = 1 ; 
output ; 

/* ------------------------------------------------------------ 
 * exists  : HYDROm 0.5mg/0.5mL 0.5mL syringe 
 * with    : SPECIAL Dilaudid MV 0.5MG 0.5mL 
 * ------------------------------------------------------------ */

medID           = "531517" ; 
NDC             = "NDC76045-000-96" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 1 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : ketorolac (INJ 15mg/1mL 1mL soln 
 * with    : SPECIAL Ketorolac 15.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "530409" ; 
NDC             = "NDC76045-107-10" ; 
isOverrideNDC   = 1 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : ketorolac (INJ 30mg/1mL 1mL soln 
 * with    : SPECIAL Ketorolac 30.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "530410" ; 
NDC             = "NDC76045-104-10" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : metoclopramide 5mg/1mL 2mL soln 
 * with    : SPECIAL Metoclopramide 10.0MG 2.0mL 
 * ------------------------------------------------------------ */

medID           = "581457" ; 
NDC             = "NDC76045-101-20" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : midazolam (PF) 1mg/1mL 2mL soln 
 * with    : SPECIAL Midazolam 2.0MG 2.0mL 
 * ------------------------------------------------------------ */

medID           = "410892" ; 
NDC             = "NDC76045-001-20" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : midazolam (INJ) 1mg/1mL 2mL soln 
 * with    : SPECIAL Midazolam 2.0MG 2.0mL 
 * ------------------------------------------------------------ */

medID           = "530557" ; 
NDC             = "NDC76045-001-20" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

/* ------------------------------------------------------------ 
 * exists  : morphine 2mg/1mL 1mL syringe 
 * with    : SPECIAL Morphine 2.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "530581" ; 
NDC             = "NDC76045-004-10" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 1 ; 
output ; 

/* ------------------------------------------------------------ 
 * exists  : morphine (IV) 4mg/1mL 1mL soln 
 * with    : SPECIAL Morphine 4.0MG 1.0mL 
 * ------------------------------------------------------------ */

medID           = "530582" ; 
NDC             = "NDC76045-005-10" ; 
isOverrideNDC   = 1 ; 
isTarget        = 1 ; 
isExisting      = 1 ; 
output ; 

/* ------------------------------------------------------------ 
 * replace : ondansetron (IN 2mg/1mL 2mL soln 
 * with    : SPECIAL Ondansetron 4.0MG 2.0mL 
 * ------------------------------------------------------------ */

medID           = "898102" ; 
NDC             = "NDC76045-103-20" ; 
isOverrideNDC   = 0 ; 
isTarget        = 1 ; 
isExisting      = 0 ; 
output ; 

run;



Object Oriented Python: is __repr__ the same as Java's toString()? (Haven't tried this yet.)

# See http://zetcode.com/python/fstring/

class User:
    def __init__(self, name, occupation):
        self.name = name
        self.occupation = occupation

    def __repr__(self):
        return f"{self.name} is a {self.occupation}"

u = User('John Doe', 'gardener')

print(f'{u}')



Rolling totals / cumulative sum

(And an easier way to create a dataframe from what I used to call instream data.)
# See https://gist.github.com/gjreda/ecdd49d133e09eee3264
# Modified to show how to choose the column(s) you want.

import pandas as pd
from io import StringIO

data = """
id,day,a,b,c
111,1,NaN,1,11
111,2,0,2,12
222,1,5,3,13
222,2,33,4,14
222,3,32,5,15
444,1,104,6,16
444,2,5,7,17
555,1,0,8,18
"""

df = pd.read_csv(StringIO(data))
GROUP_BY = ['id', 'day']
cumsums = df.groupby(GROUP_BY).sum().fillna(0).groupby(level=0).cumsum()
df.set_index(GROUP_BY, inplace=True)
df['suma'] = cumsums["a"]
df['sumc'] = cumsums["c"]
df.reset_index(inplace=True)
print(df)

#     id  day      a  b   c   suma  sumc
# 0  111    1    NaN  1  11    0.0    11
# 1  111    2    0.0  2  12    0.0    23
# 2  222    1    5.0  3  13    5.0    13
# 3  222    2   33.0  4  14   38.0    27
# 4  222    3   32.0  5  15   70.0    42
# 5  444    1  104.0  6  16  104.0    16
# 6  444    2    5.0  7  17  109.0    33
# 7  555    1    0.0  8  18    0.0    18



Copyright © 2020 by Bill Qualls