Use case : Is there possibility to check all notebooks from one particular Databricks workspace and get an inventory of where notebooks are directly accessing data through ADLS locations rather than through the metastore.
Information to capture:
Notebook names where “abfss://” is referenced
The specific file path(s) referenced in each notebook
Please help me on this if possible.
This functionality isn't available on Azure Databricks - you can search by notebook or folder names only.  But you can still do what you want by exporting notebooks to local disk & search for the string.  For exporting you can use Databricks CLI's workspace export_dir command, like this:
databricks workspace export_dir '/Shared/' ~/tmp/databricks-files
                        Finally we found a right solutions.Even through, workspace export_dir  command is one of the solution for this use case , we have to download all notebooks from workspace to local which is not
recommended by our security team . So alternatively , we have achieved through 2.0/workspace/export  rest api options in databricks notebooks itself.
Steps :
Reference: https://docs.databricks.com/dev-tools/api/latest/workspace.html#export
Complete code for reference:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import *
from pyspark.sql import Row
import base64
import requests
import json
databricks_instance ="https://databricks_instance.azuredatabricks.net"
url_list = f"{databricks_instance}/api/2.0/workspace/list"
url_export = f"{databricks_instance}/api/2.0/workspace/export"
payload = json.dumps({
  "path": "/Users/"
})
headers = {
  'Authorization': 'Bearer user_token',
  'Content-Type': 'application/json'
}
response = requests.request("GET", url_list, headers=headers, data=payload).json()
notebooks = []
# Getting the all notebooks list for given notebooks.
def list_notebooks(mylist):
  for element in mylist['objects']:
    if element['object_type'] == 'NOTEBOOK':
      notebooks.append(element)
    if element['object_type'] == 'DIRECTORY':
      payload_inner = json.dumps({
        "path": element['path']
      })
      response_inner = requests.request("GET", url_list, headers=headers, data=payload_inner).json()
      if len(response_inner) != 0:
        list_notebooks(response_inner)
  return notebooks
result = list_notebooks(response)
print(result[0])
#     print(current_note_book_id)
#     
class BearerAuth(requests.auth.AuthBase):
      def __init__(self, token):
          self.token = token
      def __call__(self, r):
          r.headers["authorization"] = 'Bearer user_token'
          
          return r
# Define the function to get the permission 
def get_permission(var_object_id):
    url_export1 = f"{databricks_instance}/api/2.0/preview/permissions/notebooks/{var_object_id}"
    response = requests.request("GET", url_export1, headers=headers, data=payload).json()
    access_control_list= response['access_control_list']
    return access_control_list
    
# Define the function to get the user and permission of the notebooks 
def get_control_user_list(access_control_list):
    access_control_user_list =[]
    for var_idx in access_control_list:
        if "user_name" in var_idx:
           var_user_name = var_idx['user_name']
        if "group_name" in var_idx:
           var_user_name = var_idx['group_name']
        var_user_permission = var_idx['all_permissions'][0]['permission_level']
        access_control_user_list.append(var_user_name)
        access_control_user_list.append(var_user_permission)
    return access_control_user_list
  
#get the current notebook path
var_current_notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
# print(var_current_notebook_path)
# Processing the every elment of the notebook objects for given location 
var_search_type_oracle ='Oracle'
# Defining the search pattern 
var_oracle_search_pattern = 'jdbc:oracle'
#mentioning the target location 
var_report_target_location = '/writing/into/target/location'
# Define the function to formatting the final results
def get_format_result(var_notebook_path,var_search_type,var_access_user_control_list):
    var_access_user_control_list_str = '-'.join(var_access_user_control_list)
    final_format_record = var_notebook_path + "|" + var_search_type + "|" + var_access_user_control_list_str
    return final_format_record
# Opening the file for report 
f = open(f"{var_report_target_location}/mytarget", 'w')
# Processing the every note book objects 
var_total_notebook_count_processed =0
var_total_notebook_count_oracle    =0
var_total_notebook_count_teradata  =0
var_total_notebook_count_datalakeservice  =0
var_total_notebook_count_oauth  =0
for notebook_items in result : 
    var_notebook_path = notebook_items["path"]
    var_object_id     = notebook_items["object_id"]
    var_note_payload  =  json.dumps({"path": f"{var_notebook_path}"})
    var_response = requests.request("GET", url_export, headers=headers, data=var_note_payload).json()
#     if "content" in var_response:
#         var_response_content=var_response['content']
#     else:
#         print ("this is not notbook cotain content = " % (var_notebook_path))
#         var_response_content = base64.b64encode(b'no data to be encoded')
    try:
        var_response_content=var_response['content']
    except:
        # Continue to next iteration as some time note book has some limitation {'error': 'DatabricksServiceException: BAD_REQUEST: content size (15395930) exceeded the limit 10485760'}
        continue
#     var_response_content_str= base64.b64decode(var_response['content']).decode("utf-8") 
    var_response_content_str= base64.b64decode(var_response_content).decode("utf-8") 
    var_total_notebook_count_processed +=1
    if var_response_content_str.find(f"{var_other_onprem_search_pattern}") != -1 and var_response_content_str.find("password") != -1 and var_notebook_path !=var_current_notebook_path :
       # oracle connection password contain note books 
       if var_response_content_str.find(f"{var_oracle_search_pattern}") != -1 and var_response_content_str.find("password") != -1 and var_notebook_path !=var_current_notebook_path :
          print(var_notebook_path)
          var_access_control_list      = get_permission(var_object_id)
          var_access_user_control_list = get_control_user_list(var_access_control_list)
          var_final_format_record      = get_format_result(var_notebook_path,var_search_type_oracle,var_access_user_control_list)
          print(var_final_format_record)
          f.write(var_final_format_record +'\n')
          var_total_notebook_count_oracle +=1
                        If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With