Identifying Sensitive Data in Object Storage Files

Customers are increasingly storing vast amounts of data in Oracle Cloud Infrastructure (OCI) Object Storage due to its durability, performance, high availability and built-in security features. However, as the volume of data grows over time, more sensitive information is being added to Object Storage: personal identifiers such as names, addresses and phone numbers, and authentication credentials such as API keys, auth tokens and resource identifiers.

While Object Storage is secure, identifying which files contain sensitive data and applying additional layers of protection, such as stronger access policies on who can view that data, is crucial for safeguarding this information and ensuring regulatory compliance.

In this blog post, we will explore how to detect sensitive information within files stored in Object Storage buckets using OCI’s AI Language service. This solution enables organizations to implement effective access controls and maintain compliance with data protection regulations.

Why Identifying Sensitive Data Matters

Regulatory Compliance
Data Breach Prevention
Data Minimization
Optimizing Access Control

Automating the Process: Scanning Files for Sensitive Data

As organizations accumulate vast amounts of unstructured and semi-structured data, manually reviewing every file for sensitive information becomes impractical. To address this challenge, leveraging tools like OCI AI Language for detecting Personally Identifiable Information (PII) can simplify the process.

The Python script below demonstrates how to automate the identification of sensitive data within files stored in OCI Object Storage.

import oci
import json
import csv
import xml.etree.ElementTree as ET
from PyPDF2 import PdfReader  
from io import BytesIO, StringIO 

# Load SDK credentials/settings via oci.config.from_file() with its defaults.
config = oci.config.from_file()

# Object Storage is used to list, read and tag; AI Language performs PII detection.
object_storage_client = oci.object_storage.ObjectStorageClient(config)
ai_language_client = oci.ai_language.AIServiceLanguageClient(config)

# Bucket to scan and the compartment passed to the PII detection request.
bucket_name = "Sensitive_Bucket"
namespace = "namespace"
compartment_id = "ocid1.compartment.oc1..aaaaaaa"

# Freeform tag applied to the bucket when PII is found, and the report path.
bucket_tag_key = 'data_sensitivity_level'
bucket_tag_value = 'High'
output_file = 'pii_scan_results.json'

# list_objects returns a single page per call. Keep requesting pages, resuming
# from next_start_with, so that every object in the bucket is scanned rather
# than only the first page.
objects = []
list_kwargs = {}
while True:
    listing = object_storage_client.list_objects(namespace, bucket_name, **list_kwargs).data
    objects.extend(listing.objects)
    if not listing.next_start_with:
        break
    list_kwargs["start"] = listing.next_start_with

# Accumulated per-object findings and a flag that triggers bucket tagging.
output_data = []
sensitive_files_found = False

def analyze_text_for_pii(text):
    """Send *text* to the AI Language PII detection call and return the SDK response.

    The text is submitted as a single document keyed "document_key" with
    language code "en", scoped to the module-level ``compartment_id``.
    The response object is returned unmodified; callers read
    ``response.data.documents`` from it.
    """
    models = oci.ai_language.models
    document = models.TextDocument(
        key="document_key",
        text=text,
        language_code="en",
    )
    request_details = models.BatchDetectLanguagePiiEntitiesDetails(
        documents=[document],
        compartment_id=compartment_id,
    )
    return ai_language_client.batch_detect_language_pii_entities(
        batch_detect_language_pii_entities_details=request_details
    )

def tag_bucket(sensitivity_level):
    """Set the ``bucket_tag_key`` freeform tag on the scanned bucket.

    Args:
        sensitivity_level: Value stored under ``bucket_tag_key``. This is the
            value actually written to the bucket (previously the global
            ``bucket_tag_value`` was written regardless of the argument).

    Raises:
        oci.exceptions.ServiceError: Propagated from the Object Storage calls.
    """
    # The update request carries the complete freeform tag map, so start from
    # the bucket's current tags to avoid dropping unrelated ones.
    current_tags = object_storage_client.get_bucket(namespace, bucket_name).data.freeform_tags or {}
    merged_tags = {**current_tags, bucket_tag_key: sensitivity_level}

    object_storage_client.update_bucket(
        namespace_name=namespace,
        bucket_name=bucket_name,
        update_bucket_details=oci.object_storage.models.UpdateBucketDetails(
            freeform_tags=merged_tags
        )
    )
    print(f"Bucket {bucket_name} tagged with {bucket_tag_key}: {sensitivity_level}")

def extract_text_from_json(content):
    """Return *content* re-serialized as indented JSON text.

    Args:
        content: JSON document as text.

    Returns:
        The document pretty-printed with a 4-space indent, or "" when
        *content* is not valid JSON.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return ""
    # ensure_ascii=False keeps non-ASCII characters (e.g. accented names)
    # literal instead of turning them into \uXXXX escapes, which would hide
    # them from the downstream text analysis.
    return json.dumps(data, indent=4, ensure_ascii=False)

def extract_text_from_xml(content):
    """Return the text content of an XML document with all markup removed.

    Element text and tails are concatenated in document order; tag names
    and attributes are not included. Returns "" when *content* cannot be
    parsed as XML.
    """
    try:
        root_element = ET.fromstring(content)
    except ET.ParseError:
        return ""
    return ET.tostring(root_element, encoding='unicode', method='text')

def extract_text_from_pdf(content):
    """Extract the text of every page of a PDF held in memory.

    Args:
        content: Raw PDF bytes.

    Returns:
        The page texts joined by newlines, or "" if the PDF cannot be read.
        Any failure is reported on stdout rather than raised, so one bad
        file does not stop the caller.
    """
    try:
        reader = PdfReader(BytesIO(content))
        # `or ""` guards against a page yielding no text object, which would
        # otherwise fail the concatenation and discard the whole document.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        print(f"Error extracting PDF content: {e}")
        return ""
    # Separate pages with a newline so the last word of one page is not
    # fused with the first word of the next.
    return "\n".join(page_texts)

def extract_text_from_csv(content):
    """Flatten CSV bytes into plain text, one line per row.

    Args:
        content: Raw CSV bytes, expected to be UTF-8 (with or without a BOM).

    Returns:
        Each row's fields joined by ", " and terminated by a newline, or ""
        if the content cannot be decoded or parsed. Failures are reported on
        stdout rather than raised.
    """
    try:
        # utf-8-sig strips a leading byte-order mark so it does not end up
        # glued to the first field; newline='' lets the csv module do its own
        # line-ending handling, as its documentation requires.
        csv_stream = StringIO(content.decode('utf-8-sig'), newline='')
        return "".join(", ".join(row) + "\n" for row in csv.reader(csv_stream))
    except Exception as e:
        print(f"Error extracting CSV content: {e}")
        return ""
    
def read_object_content(object_name, file_type):
    """Download an object from the bucket and return its content as plain text.

    Args:
        object_name: Name of the object inside ``bucket_name``.
        file_type: Lower-case extension selecting the extractor; one of
            'txt', 'json', 'xml', 'pdf' or 'csv'.

    Returns:
        The extracted text (possibly ""), or None for an unsupported
        *file_type*.

    Raises:
        oci.exceptions.ServiceError: Propagated from the get_object call.
    """
    if file_type not in ('txt', 'json', 'xml', 'pdf', 'csv'):
        # Nothing can be extracted, so skip the download entirely.
        return None

    object_data = object_storage_client.get_object(namespace, bucket_name, object_name).data
    content = object_data.content

    # These extractors work on the raw bytes.
    if file_type == 'pdf':
        return extract_text_from_pdf(content)
    if file_type == 'csv':
        return extract_text_from_csv(content)

    # Text-based formats: utf-8-sig drops a leading BOM (which would make JSON
    # parsing fail), and errors='replace' keeps a file with a few invalid
    # bytes scannable instead of raising UnicodeDecodeError.
    text = content.decode('utf-8-sig', errors='replace')
    if file_type == 'json':
        return extract_text_from_json(text)
    if file_type == 'xml':
        return extract_text_from_xml(text)
    return text

# Extensions for which a text extractor exists.
allowed_file_types = ['txt', 'json', 'xml', 'pdf', 'csv']

for obj in objects:
    object_name = obj.name
    # rpartition tells us whether the name contains a dot at all, so an
    # object literally named "csv" is not mistaken for a CSV file.
    _, dot, suffix = object_name.rpartition('.')
    file_extension = suffix.lower()
    if not dot or file_extension not in allowed_file_types:
        continue

    print(f"Analyzing document: {object_name}")
    try:
        file_content = read_object_content(object_name, file_extension)
        if not file_content:
            # Empty or unextractable content: nothing to analyze.
            continue
        response = analyze_text_for_pii(file_content)
    except oci.exceptions.ServiceError as e:
        print(f"Error analyzing document {object_name}: {e}")
        continue
    except UnicodeDecodeError as e:
        # One undecodable object must not abort the scan of the whole bucket.
        print(f"Error decoding document {object_name}: {e}")
        continue

    pii_entities = []
    for document in response.data.documents:
        pii_entities.extend(
            {"entity_type": entity.type, "text": entity.text}
            for entity in document.entities
        )

    if pii_entities:
        sensitive_files_found = True
        output_data.append({
            "bucket_name": bucket_name,
            "object_name": object_name,
            "pii_entities": pii_entities
        })

# Tag the bucket once if any scanned object contained PII.
if sensitive_files_found:
    tag_bucket(bucket_tag_value)

# NOTE: the report contains the detected values verbatim, so the output file
# is itself sensitive and should be stored accordingly.
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(output_data, json_file, indent=4)

print(f"PII scan completed. Results written to {output_file}")

In the script, update the following variables:

bucket_name: Name of the bucket to scan for sensitive data.
namespace: Namespace of the bucket.
compartment_id: Compartment ID where the OCI AI Language service will run.
bucket_tag_key and bucket_tag_value: Key-value pair used to tag buckets when sensitive data is identified.
output_file: Path to the file where detailed information about identified sensitive data will be saved.

This solution supports multiple file formats, including TXT, CSV, JSON, XML and PDF, and automatically scans for sensitive data types such as names, emails, credit card numbers and more. Refer to this page for more information on the PII entity types that are supported by the OCI AI Language service.

Once the script is executed, any identified sensitive data will result in the bucket being tagged with the specified key-value pair. Additionally, a file (output_file) will be created, containing detailed information about the sensitive data found.

Sample response in Output_File

[
    {
        "bucket_name": "Sensitive_Bucket",
        "object_name": "all_types_of_pii.txt",
        "pii_entities": [
            {
                "entity_type": "PERSON",
                "text": "John Doe"
            },
            {
                "entity_type": "ADDRESS",
                "text": "1234 Elm Street, Springfield, IL 62704"
            },
            {
                "entity_type": "AGE",
                "text": "35"
            }
        ]
    },
    {
        "bucket_name": "Sensitive_Bucket",
        "object_name": "name.pdf",
        "pii_entities": [
            {
                "entity_type": "PERSON",
                "text": "John Doe"
            },
            {
                "entity_type": "EMAIL",
                "text": "john.doe@example.com"
            },
            {
                "entity_type": "TELEPHONE_NUMBER",
                "text": "123 -456-7890\""
            }
        ]
    },
    {
        "bucket_name": "Sensitive_Bucket",
        "object_name": "pii.txt",
        "pii_entities": [
            {
                "entity_type": "IP_ADDRESS",
                "text": "192.168.1.1"
            },
            {
                "entity_type": "MAC_ADDRESS",
                "text": "00:14:22:01:23:45"
            },
            {
                "entity_type": "COOKIE",
                "text": "COOKIE: sessionid=38afes7a8; Path=/; Secure; HttpOnly\r\n"
            },
            {
                "entity_type": "JSON_WEB_TOKEN",
                "text": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c"
            }
        ]
    },
    {
        "bucket_name": "Sensitive_Bucket",
        "object_name": "sample_pii_data.csv",
        "pii_entities": [
            {
                "entity_type": "OCI_OCID_REFERENCE",
                "text": "ocid1.user.oc1..aaaaaaaa5fhegy2c7z5g4isokvrljafadk6jdslj5xdvkhzjlf77h4jrwima"
            },
            {
                "entity_type": "EMAIL",
                "text": "ocid1.user.oc1..aaaaa@oracle.com"
            },
            {
                "entity_type": "IP_ADDRESS",
                "text": "6a:32:17:ff:6f:3c:82:aa\r"
            },
            {
                "entity_type": "OCI_STORAGE_SIGNED_URL",
                "text": "https://objectstorage.us-phoenix-1.oraclecloud.com/n/namespace-string/b/bucketname/o/objectname?X-Amz-Signature=d5a1a6d5e1bfa6b7a38d\r"
            }
        ]
    },
    {
        "bucket_name": "Sensitive_Bucket",
        "object_name": "sample_pii_data.json",
        "pii_entities": [
            {
                "entity_type": "PERSON",
                "text": "John Doe"
            },
            {
                "entity_type": "EMAIL",
                "text": "john.doe@example.com"
            }
        ]
    }
]

By integrating automated PII detection and reporting into your data management strategy, you can ensure that sensitive data is properly safeguarded, minimizing risks and maintaining compliance in an evolving digital landscape.

Identifying Sensitive Data in Object Storage Files

Ramesh Balajepalli

Master Cloud Architect

How to add SSO login option on CPQ login page

Integrate Oracle Cloud Alarms with Splunk and ServiceNow

Identifying Sensitive Data in Object Storage Files

Authors

Ramesh Balajepalli

Master Cloud Architect

How to add SSO login option on CPQ login page

Integrate Oracle Cloud Alarms with Splunk and ServiceNow