Customers are increasingly storing vast amounts of data in Oracle Cloud Infrastructure (OCI) Object Storage due to its durability, performance, high availability and built-in security features. However, as the volume of data grows over time, more sensitive information, such as personal identifiers (names, addresses, phone numbers) and authentication credentials (API keys, auth tokens, resource identifiers), is being added to Object Storage.
While Object Storage is secure, identifying which files contain sensitive data and applying additional layers of protection, such as stronger access policies on who can view that data, is crucial for safeguarding this information and ensuring regulatory compliance.
In this blog post, we will explore how to detect sensitive information within files stored in Object Storage buckets using the PII detection capability of the OCI Language service. This solution enables organizations to implement effective access controls and maintain compliance with data protection regulations.
Why Identifying Sensitive Data Matters
- Regulatory Compliance
- Data Breach Prevention
- Data Minimization
- Optimizing Access Control
Automating the Process: Scanning Files for Sensitive Data
As organizations accumulate vast amounts of unstructured and semi-structured data, manually reviewing every file for sensitive information becomes impractical. To address this challenge, leveraging tools like OCI AI Language for detecting Personally Identifiable Information (PII) can simplify the process.
The Python script below demonstrates how to automate the identification of sensitive data within files stored in OCI Object Storage.
import oci
import json
import csv
import xml.etree.ElementTree as ET
from PyPDF2 import PdfReader
from io import BytesIO, StringIO
# --- Clients ---------------------------------------------------------------
# SDK settings are loaded with oci.config.from_file() using its defaults.
config = oci.config.from_file()
object_storage_client = oci.object_storage.ObjectStorageClient(config)
ai_language_client = oci.ai_language.AIServiceLanguageClient(config)

# --- Values to adjust before running ---------------------------------------
bucket_name = "Sensitive_Bucket"
namespace = "namespace"
compartment_id = "ocid1.compartment.oc1..aaaaaaa"
bucket_tag_key = 'data_sensitivity_level'
bucket_tag_value = 'High'
output_file = 'pii_scan_results.json'

# --- Object listing --------------------------------------------------------
# list_objects returns a single page per call. Keep requesting pages, passing
# the previous page's next_start_with as `start`, until no further page is
# reported; otherwise objects beyond the first page are never scanned.
objects = []
next_start = None
while True:
    page_kwargs = {"start": next_start} if next_start else {}
    listing = object_storage_client.list_objects(
        namespace, bucket_name, **page_kwargs
    ).data
    objects.extend(listing.objects)
    next_start = listing.next_start_with
    if not next_start:
        break

# --- Scan state ------------------------------------------------------------
output_data = []               # one entry per object in which PII was found
sensitive_files_found = False  # flipped to True once any PII is detected
def analyze_text_for_pii(text):
    """Send ``text`` to the Language client's batch PII detection call.

    The text is wrapped in a single document (key ``"document_key"``,
    language code ``"en"``) and submitted through the module-level
    ``ai_language_client`` under the module-level ``compartment_id``.

    Args:
        text: The text to analyse.

    Returns:
        The response object returned by
        ``batch_detect_language_pii_entities``, unmodified.
    """
    single_document = oci.ai_language.models.TextDocument(
        key="document_key",
        text=text,
        language_code="en",
    )
    request_details = oci.ai_language.models.BatchDetectLanguagePiiEntitiesDetails(
        documents=[single_document],
        compartment_id=compartment_id,
    )
    return ai_language_client.batch_detect_language_pii_entities(
        batch_detect_language_pii_entities_details=request_details
    )
def tag_bucket(sensitivity_level):
    """Tag the scanned bucket with ``sensitivity_level``.

    Sets the freeform tag ``bucket_tag_key`` on the module-level bucket to
    the value passed in. The previous version ignored the argument and always
    wrote the global ``bucket_tag_value``, so the printed message could
    disagree with the tag actually applied.

    Args:
        sensitivity_level: Value to store under ``bucket_tag_key``.
    """
    # The update call is given a complete freeform-tag map, so start from the
    # bucket's current tags to avoid dropping ones set by someone else.
    # NOTE(review): assumes update_bucket replaces the whole tag map; confirm
    # against the Object Storage API reference.
    current_bucket = object_storage_client.get_bucket(namespace, bucket_name).data
    freeform_tags = dict(current_bucket.freeform_tags or {})
    freeform_tags[bucket_tag_key] = sensitivity_level

    object_storage_client.update_bucket(
        namespace_name=namespace,
        bucket_name=bucket_name,
        update_bucket_details=oci.object_storage.models.UpdateBucketDetails(
            freeform_tags=freeform_tags
        )
    )
    print(f"Bucket {bucket_name} tagged with {bucket_tag_key}: {sensitivity_level}")
def extract_text_from_json(content):
    """Return ``content`` re-serialised as indented JSON text.

    Args:
        content: A JSON document (anything ``json.loads`` accepts).

    Returns:
        The document pretty-printed with a 4-space indent, or ``""`` when
        ``content`` is not valid JSON.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return ""
    # ensure_ascii=False keeps non-ASCII characters (e.g. accented names)
    # literal; the default would turn them into \uXXXX escapes, hiding the
    # real text from the PII analysis that consumes this output.
    return json.dumps(data, indent=4, ensure_ascii=False)
def extract_text_from_xml(content):
    """Return the text content of an XML document with the markup removed.

    Args:
        content: The XML document to parse.

    Returns:
        The document serialised with ``method='text'`` as a ``str``, or
        ``""`` when the document cannot be parsed.
    """
    try:
        root_element = ET.fromstring(content)
    except ET.ParseError:
        return ""
    # Serialising with method='text' emits only element text, no tags.
    return ET.tostring(root_element, encoding='unicode', method='text')
def extract_text_from_pdf(content):
    """Extract the text of every page of a PDF held in memory.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The text of all pages joined with newlines, or ``""`` if the PDF
        could not be read (the error is printed, not raised).
    """
    try:
        reader = PdfReader(BytesIO(content))
        # `or ""` guards against a page yielding no text (falsy/None result),
        # which would otherwise break the join and discard the whole file.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Best-effort: an unreadable PDF is reported and skipped, not fatal.
        print(f"Error extracting PDF content: {e}")
        return ""
    # Separate pages with a newline so the last word of one page is not fused
    # with the first word of the next before PII analysis.
    return "\n".join(page_texts)
def extract_text_from_csv(content):
    """Flatten CSV bytes into plain text, one line per row.

    Args:
        content: Raw bytes of a UTF-8 encoded CSV file (a leading byte-order
            mark is tolerated).

    Returns:
        Each row's fields joined with ``", "`` and terminated by ``"\\n"``,
        or ``""`` if the content cannot be decoded or parsed (the error is
        printed, not raised).
    """
    try:
        # utf-8-sig strips a leading BOM so it does not end up glued to the
        # first field; plain UTF-8 input decodes exactly as before.
        decoded = content.decode('utf-8-sig')
        # newline='' leaves line-ending handling to the csv module, as its
        # documentation recommends, so CR-only files parse too.
        rows = csv.reader(StringIO(decoded, newline=''))
        return "".join(", ".join(row) + "\n" for row in rows)
    except Exception as e:
        # Best-effort: a malformed CSV is reported and skipped, not fatal.
        print(f"Error extracting CSV content: {e}")
        return ""
def read_object_content(object_name, file_type):
    """Download an object from the bucket and return its text content.

    Args:
        object_name: Name of the object inside the module-level bucket.
        file_type: Lower-case extension selecting the extractor
            (``'txt'``, ``'json'``, ``'xml'``, ``'pdf'`` or ``'csv'``).

    Returns:
        The extracted text; ``""`` when the content cannot be decoded or
        parsed; ``None`` when ``file_type`` is not supported.
    """
    content = object_storage_client.get_object(
        namespace, bucket_name, object_name
    ).data.content

    # These extractors work on the raw bytes themselves.
    if file_type == 'pdf':
        return extract_text_from_pdf(content)
    if file_type == 'csv':
        return extract_text_from_csv(content)
    if file_type not in ('txt', 'json', 'xml'):
        return None

    # The remaining types are treated as UTF-8 text. A decode failure is
    # reported and yields "" (like the other extractors) instead of raising,
    # so one badly encoded object cannot abort the whole scan.
    try:
        text = content.decode('utf-8')
    except UnicodeDecodeError as e:
        print(f"Error decoding {object_name} as UTF-8: {e}")
        return ""

    if file_type == 'json':
        return extract_text_from_json(text)
    if file_type == 'xml':
        return extract_text_from_xml(text)
    return text
# Extensions the script knows how to extract text from.
allowed_file_types = ['txt', 'json', 'xml', 'pdf', 'csv']

for obj in objects:
    object_name = obj.name
    # Take the extension only when the name actually contains a dot, so an
    # extension-less object called e.g. "csv" is not mistaken for a CSV file.
    _, dot, suffix = object_name.rpartition('.')
    file_extension = suffix.lower()
    if not dot or file_extension not in allowed_file_types:
        continue

    print(f"Analyzing document: {object_name}")
    try:
        file_content = read_object_content(object_name, file_extension)
        if not file_content:
            # Nothing to analyse (empty, unreadable or unsupported content).
            continue
        response = analyze_text_for_pii(file_content)
    except (oci.exceptions.ServiceError, UnicodeDecodeError) as e:
        # A failure on one object is reported and the scan moves on.
        print(f"Error analyzing document {object_name}: {e}")
        continue

    pii_entities = [
        {"entity_type": entity.type, "text": entity.text}
        for document in response.data.documents
        for entity in document.entities
    ]
    if pii_entities:
        sensitive_files_found = True
        output_data.append({
            "bucket_name": bucket_name,
            "object_name": object_name,
            "pii_entities": pii_entities
        })

# Tag the bucket once if any scanned object contained PII.
if sensitive_files_found:
    tag_bucket(bucket_tag_value)

# NOTE: the report stores the detected PII values in clear text; keep the
# output file somewhere with appropriately restricted access.
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(output_data, json_file, indent=4)
print(f"PII scan completed. Results written to {output_file}")
In the script, update the following values:
- bucket_name: Name of the bucket to scan for sensitive data.
- namespace: Namespace of the bucket.
- compartment_id: Compartment ID where the OCI Language service will run.
- bucket_tag_key and bucket_tag_value: Key-value pair used to tag buckets when sensitive data is identified.
- output_file: Path to the file where detailed information about identified sensitive data will be saved.
This solution supports multiple file formats, including TXT, CSV, JSON, XML and PDF, and automatically scans for sensitive data types such as names, emails, credit card numbers and more. Refer to this page for more information on the PII entity types that are supported by the OCI Language service.
Once the script is executed, any identified sensitive data will result in the bucket being tagged with the specified key-value pair. Additionally, a file (output_file) will be created, containing detailed information about the sensitive data found.
Sample response in Output_File
[
{
"bucket_name": "Sensitive_Bucket",
"object_name": "all_types_of_pii.txt",
"pii_entities": [
{
"entity_type": "PERSON",
"text": "John Doe"
},
{
"entity_type": "ADDRESS",
"text": "1234 Elm Street, Springfield, IL 62704"
},
{
"entity_type": "AGE",
"text": "35"
}
]
},
{
"bucket_name": "Sensitive_Bucket",
"object_name": "name.pdf",
"pii_entities": [
{
"entity_type": "PERSON",
"text": "John Doe"
},
{
"entity_type": "EMAIL",
"text": "john.doe@example.com"
},
{
"entity_type": "TELEPHONE_NUMBER",
"text": "123 -456-7890\""
}
]
},
{
"bucket_name": "Sensitive_Bucket",
"object_name": "pii.txt",
"pii_entities": [
{
"entity_type": "IP_ADDRESS",
"text": "192.168.1.1"
},
{
"entity_type": "MAC_ADDRESS",
"text": "00:14:22:01:23:45"
},
{
"entity_type": "COOKIE",
"text": "COOKIE: sessionid=38afes7a8; Path=/; Secure; HttpOnly\r\n"
},
{
"entity_type": "JSON_WEB_TOKEN",
"text": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c"
}
]
},
{
"bucket_name": "Sensitive_Bucket",
"object_name": "sample_pii_data.csv",
"pii_entities": [
{
"entity_type": "OCI_OCID_REFERENCE",
"text": "ocid1.user.oc1..aaaaaaaa5fhegy2c7z5g4isokvrljafadk6jdslj5xdvkhzjlf77h4jrwima"
},
{
"entity_type": "EMAIL",
"text": "ocid1.user.oc1..aaaaa@oracle.com"
},
{
"entity_type": "IP_ADDRESS",
"text": "6a:32:17:ff:6f:3c:82:aa\r"
},
{
"entity_type": "OCI_STORAGE_SIGNED_URL",
"text": "https://objectstorage.us-phoenix-1.oraclecloud.com/n/namespace-string/b/bucketname/o/objectname?X-Amz-Signature=d5a1a6d5e1bfa6b7a38d\r"
}
]
},
{
"bucket_name": "Sensitive_Bucket",
"object_name": "sample_pii_data.json",
"pii_entities": [
{
"entity_type": "PERSON",
"text": "John Doe"
},
{
"entity_type": "EMAIL",
"text": "john.doe@example.com"
}
]
}
]
By integrating automated PII detection and reporting into your data management strategy, you can ensure that sensitive data is properly safeguarded, minimizing risks and maintaining compliance in an evolving digital landscape.
