diff --git a/R/AzureHDI.R b/R/AzureHDI.R index 8ebaf0d..bd5bc61 100644 --- a/R/AzureHDI.R +++ b/R/AzureHDI.R @@ -258,7 +258,7 @@ azureCreateHDI <- function(azureActiveContext, resourceGroup, location, #' @family HDInsight functions #' @export azureResizeHDI <- function(azureActiveContext, clustername, - role = c("worker", "head", "edge"), + role = c("workernode", "headnode", "edgenode"), size = 2, mode = c("Sync", "Async"), subscriptionID, resourceGroup, verbose = FALSE) { @@ -267,7 +267,7 @@ azureResizeHDI <- function(azureActiveContext, clustername, assert_that(is_resource_group(resourceGroup)) assert_that(is_clustername(clustername)) - assert_that(is.integer(size)) + assert_that(is.integer(as.integer(size))) role <- match.arg(role) mode <- match.arg(mode) diff --git a/R/AzureHive.R b/R/AzureHive.R index fdae837..3aca7ef 100644 --- a/R/AzureHive.R +++ b/R/AzureHive.R @@ -30,14 +30,18 @@ azureHiveStatus <- function(azureActiveContext, clustername, hdiAdmin, if (!length(HP)) { stop("Error: No Valid hdiPassword provided") } - + verbosity <- set_verbosity(verbose) + azureActiveContext$hdiAdmin <- HA azureActiveContext$hdiPassword <- HP azureActiveContext$clustername <- CN - + cat(HA) + cat(HP) + uri <- paste0("https://", CN, ".azurehdinsight.net/templeton/v1/status") - r <- GET(URL, add_headers(.headers = c(`Content-type` = "application/json")), + cat(uri) + r <- GET(uri, add_headers(.headers = c(`Content-type` = "application/json")), authenticate(HA, HP), verbosity) if (status_code(r) != 200 && status_code(r) != 201) { stop(paste0("Error: Return code(", status_code(r), ")")) @@ -130,9 +134,9 @@ azureHiveSQL <- function(azureActiveContext, CMD, clustername, hdiAdmin, if (DUR < 5) DUR <- DUR + 1 if (df$status$state == "PREP") - message("P") + message("P",appendLF = FALSE) if (df$status$state == "RUNNING") - message("R") + message("R",appendLF = FALSE) # print(df$status$state) r <- GET(URL, add_headers(.headers = c(`Content-type` = "application/json")), @@ -142,9 
+146,9 @@ azureHiveSQL <- function(azureActiveContext, CMD, clustername, hdiAdmin, df <- fromJSON(rl) } if (df$status$state == "SUCCEEDED") - message("S") + message("S",appendLF = FALSE) if (df$status$state == "FAILED") - message("F") + message("F",appendLF = FALSE) STATE <- df$status$state message("Finished Running statement: ", Sys.time()) diff --git a/R/AzureSpark.R b/R/AzureSpark.R index 8b4483e..4799552 100644 --- a/R/AzureSpark.R +++ b/R/AzureSpark.R @@ -276,13 +276,17 @@ azureSparkCMD <- function(azureActiveContext, CMD, clustername, hdiAdmin, sep = "") # print(URL) message(paste("CMD Running: ", Sys.time())) - message("Running(R), Completed(C)") + message("Running(R) Waiting(W) Completed(C)") - while (df$state == "running") { + while (df$state == "running" || df$state == "waiting") { Sys.sleep(DUR) if (DUR < 5) DUR <- DUR + 1 - message("R") + if (df$state == "running") + message("R",appendLF = FALSE) + if (df$state == "waiting") + message("W",appendLF = FALSE) + r <- GET(URL, add_headers(.headers = c(`Content-type` = "application/json")), authenticate(HA, HP)) rl <- content(r, "text", encoding = "UTF-8") @@ -290,7 +294,7 @@ azureSparkCMD <- function(azureActiveContext, CMD, clustername, hdiAdmin, df <- fromJSON(rl) } - message("C") + message("C",appendLF = FALSE) message("Finished Running statement: ", Sys.time()) RET <- df$output$data[1] # rownames(RET) <- 'Return Value' diff --git a/README.md b/README.md index 9528362..1bc585b 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ To get started with this package, see the vignettes: - * [Tutorial](http://htmlpreview.github.io/?https://github.com/Microsoft/AzureSMR/blob/master/vignettes/tutorial.html) - * [Getting Authenticated](http://htmlpreview.github.io/?https://github.com/Microsoft/AzureSMR/blob/master/vignettes/Authentication.html) + * [Tutorial](http://htmlpreview.github.io/?https://github.com/Microsoft/AzureSMR/blob/master/inst/doc/tutorial.html) + * [Getting 
Authenticated](http://htmlpreview.github.io/?https://github.com/Microsoft/AzureSMR/blob/master/inst/doc/Authentication.html) To access the package help, just type `?AzureSMR` into your code editor. diff --git a/inst/examples/example_azureDeleteHDI.R b/inst/examples/example_azureDeleteHDI.R new file mode 100644 index 0000000..300ba55 --- /dev/null +++ b/inst/examples/example_azureDeleteHDI.R @@ -0,0 +1,5 @@ +\dontrun{ +library(AzureSMR) + +azureDeleteHDI(asc, clustername = "azuresmrclustername") +} diff --git a/vignettes/Authentication.R b/vignettes/Authentication.R deleted file mode 100644 index d33720d..0000000 --- a/vignettes/Authentication.R +++ /dev/null @@ -1,4 +0,0 @@ -## ---- eval = FALSE------------------------------------------------------- -# sc <- createAzureContext(tenantID = "{TID}", clientID = "{CID}", authKey= "{KEY}") -# rgs <- azureListRG(sc) -# rgs diff --git a/vignettes/Authentication.Rmd b/vignettes/Authentication.Rmd index a85150f..1deae5f 100644 --- a/vignettes/Authentication.Rmd +++ b/vignettes/Authentication.Rmd @@ -47,7 +47,7 @@ To apply access control azToken Resource Group 16. Identify the resource group you will associate with this application. -17. Choose the Users menu item from the Resource scope. +17. Choose the Access Control(IAM) menu item from the Resource scope. 18. In the resulting scope click the `+ Add` button. @@ -62,7 +62,7 @@ Alternatively you can access control azToken Subscription Level 16. Identify the Subscription you will associate with this application. -17. Choose the Users(access) menu item. +17. Choose the Access Control(IAM) menu item. 18. In the resulting scope click the + Add button. @@ -71,7 +71,6 @@ Alternatively you can access control azToken Subscription Level 20. Select the resulting list item for that App then click Select in that scope then OK in the "Add access" scope. The user will be added to the list. - That is all. 
You can test this by trying: ```{r, eval = FALSE} diff --git a/vignettes/Authentication.html b/vignettes/Authentication.html deleted file mode 100644 index 87aad75..0000000 --- a/vignettes/Authentication.html +++ /dev/null @@ -1,136 +0,0 @@ - - - - -
- - - - - - - - - - - -Follow these instructions to create an active directory. You will need to collect the tenant ID (tenantID
), client ID (clientID
) and authentication key (authKey
) in order to authenticate with the createAzureContext()
function.
Login to the Classic (i.e., the old) Portal https://manage.windowsazure.com/.
On the left hand menu you should see amongst all the items one called ACTIVE DIRECTORY. Click the item and an active directory DIRECTORY will be listed.
Click on an entry under the Name column (if there is only one entry the choice is easy!) to take you to a page of options to get started with some text like I WANT TO.
Along the top menu click APPLICATIONS.
You probably want to create a new application so type a name for it in the Search box (I called mine AzureSMR). The search result will come back with no results and a button that says ADD AN APPLICATION
-> which you should click.
Give the application a name and choose WEB APPLICATION AND/OR WEB API
. Then go to the next page ->
.
Provide some dummy URLs. They are not used but they must be valid URLs. Click on the tick to continue to create the application.
Under the Configure menu button take note of the client ID. 9. Under the Keys section choose a 1 year duration (or 2) and click the Save button at the bottom of the screen. A key is generated which you should copy now and save it somewhere.
We also need the tenant ID. Click the VIEW ENDPOINTS
button on the bottom of the screen and find a list of endpoints all including the tenant ID as a sequence of hexadecimals.
Now we need to set up the applications permissions. Click the Add application
button. In the resulting window scroll to Windows Azure Service Management API
and select it. Then click the Tick icon.
Under the resulting “permissions to other applications” section, for the Windows Azure Service Management API entry, from the “Delegated Permissions:0” drop down tick the Access Azure Service Management as organization.
Click on the Save icon at the bottom of the window again.
Now we need to assign the application a role and to do so we go to the (new) Azure portal. https://portal.azure.com/
To apply access control at Resource Group
-Click on Resource Groups menu item on the left in the portal.
Identify the resource group you will associate with this application.
Choose the Users menu item from the Resource scope.
In the resulting scope click the + Add
button.
Choose the role as Owner and under the user search box enter the name of the App, e.g., AzureSMR.
Select the resulting list item for that App then click Select in that scope then OK in the “Add access” scope. The user will be added to the list.
Alternatively you can apply access control at Subscription Level
-Click on Subscriptions on the left menu item in the portal.
Identify the Subscription you will associate with this application.
Choose the Users(access) menu item.
In the resulting scope click the + Add button.
Choose the role as Owner and under the user search box enter the name of the App, e.g., AzureSMR.
Select the resulting list item for that App then click Select in that scope then OK in the “Add access” scope. The user will be added to the list.
That is all. You can test this by trying:
-sc <- createAzureContext(tenantID = "{TID}", clientID = "{CID}", authKey= "{KEY}")
-rgs <- AzureListRG(sc)
-rgs
For more information see the tutorial Use portal to create Active Directory application and service principal that can access resources
-Use this package to manage Azure Resources from within an R Session. This is not a full SDK just a collection of functions that should prove useful for a Data Scientist who needs to access and manage Azure Resources.
-Install the development version of the package directly from GitHub with:
-# Install devtools
-if(!require("devtools")) install.packages("devtools")
-devtools::install_github("Microsoft/AzureSMR")
-library(AzureSMR)
AzureSMR provides an interface to manage resources on Microsoft Azure. The main functions address the following Azure Services:
-For a detailed list of AzureSMR functions and their syntax please refer to the Help pages.
-The AzureAPIs require lots of parameters to be managed. Rather than supplying all the parameters for each function call AzureSMR implements an AzureContext Variable which caches the last time a parameter is used so that it doesn't need to be repeatedly supplied.
-To create an AzureContext object and attempt to authenticate against the Azure service, use:
-sc <- createAzureContext(tenantID = "{TID}", clientID = "{CID}", authKey= "{KEY}")
-sc
To get an authorisation token use azureAuthenticate()
. Note this token will time out after a period and therefore you need to run it again occasionally. TIP: Use AzureAuthenticate before a long running task.
The azureListSubscriptions()
function lists all the available subscriptions. If you only have one it sets the default Subscription in the azureActiveContext
to that subscription ID.
azureListSubscriptions(sc)
# list resource groups
-AzureListRG(sc)
-
-# list all resources
-azureListAllResources(sc)
-
-azureListAllResources(sc, location = "northeurope")
-
-azureListAllResources(sc, type = "Microsoft.Sql/servers", location = "northeurope")
-
-azureListAllResources(sc, resourceGroup = "Analytics")
-
-azureCreateResourceGroup(sc, resourceGroup = "testme", location = "northeurope")
-
-azureDeleteResourceGroup(sc, resourceGroup = "testme")
-
-azureListRG(sc)$name
Use these functions to list, start and stop Virtual Machines (see templates for Creation).
-To Create VMs please refer to Resource Templates below.
-azureListVM(sc, resourceGroup = "AWHDIRG")
-
-## Name Location Type OS State Admin
-## 1 DSVM1 northeurope Microsoft.Compute/virtualMachines Linux Succeeded alanwe
-
-azureStartVM(sc, vmName = "DSVM1")
-azureStopVM(sc, vmName = "DSVM1")
In order to access Storage Blobs you need to have a key. Use azureSAGetKey()
to get a Key or alternatively supply your own key. When you provide your own key you no longer need to use azureAuthenticate()
since the API uses a different authentication approach.
sKey <- AzureSAGetKey(sc, resourceGroup = "Analytics", storageAccount = "analyticsfiles")
To list containers in a storage account use azureListContainers()
azListContainers(sc, storageAccount = "analyticsfiles", containers = "Test")
To list blobs in a container use azureListStorageBlobs()
azureListStorageBlobs(sc, storageAccount = "analyticsfiles", container = "test")
To write a blob use azurePutBlob()
AzurePutBlob(sc, StorageAccount = "analyticsfiles", container = "test",
- contents = "Hello World",
- blob = "HELLO")
To read a blob in a container use azureGetBlob()
azureGetBlob(sc, storageAccount = "analyticsfiles", container = "test",
- blob="HELLO",
- type="text")
You can use AzureSMR
to manage Azure HDInsight clusters. To create clusters use Resource Templates (See below).
Also see functions for submitting Hive and Spark jobs.
-Use azureListHDI()
to list available Clusters.
azureListHDI(sc)
-azureListHDI(sc, resourceGroup ="Analytics")
Use azureResizeHDI()
to resize a cluster
azureResizeHDI(sc, resourceGroup = "Analytics", clusterName = "{HDIClusterName}",
- Role="workernode",Size=2)
-
-## AzureResizeHDI: Request Submitted: 2016-06-23 18:50:57
-## Resizing(R), Succeeded(S)
-## RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR
-## RRRRRRRRRRRRRRRRRRS
-## Finished Resizing Sucessfully: 2016-06-23 19:04:43
-## Finished: 2016-06-23 19:04:43
-## ## Information
-## " headnode ( 2 * Standard_D3_v2 ) workernode ( 5 * Standard_D3_v2 ) zookeepernode ( 3 * Medium ) edgenode0 ( 1 * Standard_D4_v2 )"
The easiest way to create resources on Azure is to use Azure Templates. To create Azure Resources such as HDInsight clusters there can be a large quantity of parameters. Resource templates can be built by creating a resource in the Azure Portal and then going into Settings > Automation scripts. Example templates can be found at this URL https://github.com/Azure/AzureStack-QuickStart-Templates.
-To create a resource using a template in AzureSMR use AzureDeployTemplate. The Template and Parameters must be available in a public URL (Azure Blob). It may be worth getting the Azure Administrator to build a working template.
-azureDeployTemplate(sc, resourceGroup = "Analytics", deplName = "Deploy1",
- templateURL = "{TEMPLATEURL}", paramURL = "{PARAMURL}")
-
-## AzureDeployTemplate: Request Submitted: 2016-06-23 18:50:57
-## Resizing(R), Succeeded(S)
-## RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR
-## RRRRRRRRRRRRRRRRRRS
-## Finished Deployed Sucessfully: 2016-06-23 19:04:43
-## Finished: 2016-06-23 19:04:43
ADMIN TIP: If a deployment fails, go to the Azure Portal and look at the Activity logs for failed deployments, which should explain why the deployment failed.
-These functions facilitate the use of hive jobs on a HDInsight Cluster
-azureHiveStatus(sc, clusterName = "{hdicluster}",
- hdiAdmin = "admin",
- hdiPassword = "********")
-AzureHiveSQL(sc,
- CMD = "select * from airports",
- Path = "wasb://{container}@{hdicluster}.blob.core.windows.net/")
-
-stdout <- AzureGetBlob(sc, Container = "test", Blob = "stdout")
-
-read.delim(text=stdout, header=TRUE, fill=TRUE)
AzureSMR
provides some functions that allow HDInsight Spark Sessions and jobs to be managed within an R Session
To Create a new Spark Session (Via Livy) use azureSparkNewSession()
azureSparkNewSession(sc, clusterName = "{hdicluster}",
- hdiAdmin = "admin",
- hdiPassword = "********",
- kind = "pyspark")
To view the status of sessions use AzureSparkListSessions
-azureSparkListSessions(sc, clusterName = "{hdicluster}")
To send a command to the Spark Session use azureSparkCMD()
. In this case it submits a Python routine
# SAMPLE PYSPARK SCRIPT TO CALCULATE PI
-pythonCmd <- '
-from pyspark import SparkContext
-from operator import add
-import sys
-from random import random
-partitions = 1
-n = 20000000 * partitions
-def f(_):
- x = random() * 2 - 1
- y = random() * 2 - 1
- return 1 if x ** 2 + y ** 2 < 1 else 0
-
-count = sc.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
-Pi = (4.0 * count / n)
-print("Pi is roughly %f" % Pi)'
-
-azureSparkCMD(sc, cmd = pythonCmd, sessionID = "5")
-
-## [1] "Pi is roughly 3.140285"
Check Session variables are retained
-azureSparkCMD(sc, clusterName = "{hdicluster}", cmd = "print Pi", sessionID="5")
-
-#[1] "3.1422"