Ksama Arora

# Azure Machine Learning workspace details:
subscription = '<subscription_id>'
resource_group = '<resource_group>'
workspace = '<workspace>'
datastore_name = '<datastore>'
path_on_datastore = '<path>'

# Long-form Datastore URI format:
uri = f'azureml://subscriptions/{subscription}/resourcegroups/{resource_group}/workspaces/{workspace}/datastores/{datastore_name}/paths/{path_on_datastore}'

OR go to Data -> specific Datastore -> select csv file -> copy URI (Datastore URI or Storage URI)

# Import pandas library
import pandas as pd

# Populate dataframe "my_dataframe" using the pandas read CSV method by passing in the URI acquired
my_dataframe = pd.read_csv("URI")

# Then run dataframe head passing in a value for the number of rows you want to return
my_dataframe.head(1000)
# Fill missing values in the "Claim Network Status" column with "Unkown" and update the dataframe in place
my_dataframe.fillna(
    value={"Claim Network Status": "Unkown"}, inplace=True)

# Fill missing values in the "Payment Status" column with "Unkown" and update the dataframe in place
my_dataframe.fillna(
    value={"Payment Status": "Unkown"}, inplace=True)

# Return the first 1000 rows of the dataframe
my_dataframe.head(1000)
# Drop rows with any missing values and update the dataframe in place
my_dataframe.dropna(inplace=True)

# Return the first 1000 rows of the dataframe
my_dataframe.head(1000)

Wrangling Data with Apache Spark

# Import pyspark pandas library
import pyspark.pandas as pd
my_dataframe = pd.read_csv("URI")
my_dataframe.head(1000)
# Drop duplicate rows and update the dataframe in place
my_dataframe.drop_duplicates(inplace=True)

# Sort the dataframe by its index and update the dataframe in place
my_dataframe.sort_index(inplace=True)

# Return the first 1000 rows of the dataframe
my_dataframe.head(1000)
# Save the dataframe to a CSV file at the specified Azure Data Lake Storage path
my_dataframe.to_csv(
    "abfss://<FILE_SYSTEM_NAME>@<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net/data/wrangled"
    )