Hi @leon,
This PowerShell script scans every file in every document library of a site, records each file's name, hash, URL, and size, exports the full inventory to a CSV report, and then lists potential duplicates grouped by hash, by file name, and by file size.
# Inventory of every file in every visible document library of a SharePoint Online site.
# For each file the script records FileName, HashCode (MD5 of the content), URL
# (server-relative) and FileSize, exports all rows to $CSVPath, and then prints three
# groupings of potential duplicates: by hash, by file name and by file size.
# NOTE: the CSV contains ALL scanned files, not only the duplicates.

#Load SharePoint CSOM Assemblies
Add-Type -Path "C:\Program Files\Common Files\Microsoft Shared\Web Server Extensions\16\ISAPI\Microsoft.SharePoint.Client.dll"
Add-Type -Path "C:\Program Files\Common Files\Microsoft Shared\Web Server Extensions\16\ISAPI\Microsoft.SharePoint.Client.Runtime.dll"

#Parameters
$SiteURL = "https://mycompany.sharepoint.com/sites/marketing"
$CSVPath = "C:\Temp\Duplicates.csv"
$BatchSize = 2000

#Collection for Result Data (a List avoids re-copying an array on every append)
$DataCollection = New-Object 'System.Collections.Generic.List[PSObject]'

#Get credentials to connect
$Cred = Get-Credential

#Declared up front so the Finally block can release them even if setup fails
$Ctx = $null
$MD5 = $null

Try {
    #Setup the Context
    $Ctx = New-Object Microsoft.SharePoint.Client.ClientContext($SiteURL)
    $Ctx.Credentials = New-Object Microsoft.SharePoint.Client.SharePointOnlineCredentials($Cred.UserName, $Cred.Password)

    #One hash provider is reused for every file instead of creating one per file
    $MD5 = New-Object -TypeName System.Security.Cryptography.MD5CryptoServiceProvider

    #Get the Web
    $Web = $Ctx.Web
    $Lists = $Web.Lists
    $Ctx.Load($Web)
    $Ctx.Load($Lists)
    $Ctx.ExecuteQuery()

    #Iterate through Each List on the web
    ForEach($List in $Lists)
    {
        #Filter Lists: only visible, non-empty document libraries, excluding system libraries
        If($List.BaseType -eq "DocumentLibrary" -and $List.Hidden -eq $False -and $List.ItemCount -gt 0 -and $List.Title -Notin("Site Pages","Style Library", "Preservation Hold Library"))
        {
            #Define CAML Query to get Files from the list in batches.
            #The XML must start with '<View'; any leading character makes it invalid.
            $Query = New-Object Microsoft.SharePoint.Client.CamlQuery
            $Query.ViewXml = "<View Scope='RecursiveAll'>" +
                "<Query><OrderBy><FieldRef Name='ID' Ascending='TRUE'/></OrderBy></Query>" +
                "<RowLimit Paged='TRUE'>$BatchSize</RowLimit>" +
                "</View>"
            $Counter = 1

            #Get Files from the Library in Batches
            Do {
                $ListItems = $List.GetItems($Query)
                $Ctx.Load($ListItems)
                $Ctx.ExecuteQuery()

                ForEach($Item in $ListItems)
                {
                    #Filter Files (folders are skipped but still counted for progress)
                    If($Item.FileSystemObjectType -eq "File")
                    {
                        #Get the File from Item
                        $File = $Item.File
                        $Ctx.Load($File)
                        $Ctx.ExecuteQuery()

                        #Clamp to 100: Write-Progress rejects larger values, and the list may
                        #hold more items than the ItemCount that was loaded at the start
                        $Percent = [Math]::Min(100, [int]($Counter / $List.ItemCount * 100))
                        Write-Progress -PercentComplete $Percent -Activity "Processing File $Counter of $($List.ItemCount) in $($List.Title) of $($Web.URL)" -Status "Scanning File '$($File.Name)'"

                        #Get The File Hash
                        $Bytes = $File.OpenBinaryStream()
                        $Ctx.ExecuteQuery()
                        Try {
                            $HashCode = [System.BitConverter]::ToString($MD5.ComputeHash($Bytes.Value))
                        }
                        Finally {
                            #Release the downloaded content stream as soon as it has been hashed
                            If($Bytes.Value) { $Bytes.Value.Dispose() }
                        }

                        #Collect data
                        $Data = New-Object PSObject
                        $Data | Add-Member -MemberType NoteProperty -name "FileName" -value $File.Name
                        $Data | Add-Member -MemberType NoteProperty -Name "HashCode" -value $HashCode
                        $Data | Add-Member -MemberType NoteProperty -Name "URL" -value $File.ServerRelativeUrl
                        $Data | Add-Member -MemberType NoteProperty -Name "FileSize" -value $File.Length
                        $DataCollection.Add($Data)
                    }
                    $Counter++
                }

                #Update Position of the ListItemCollectionPosition; $null means no more pages
                $Query.ListItemCollectionPosition = $ListItems.ListItemCollectionPosition
            }While($Query.ListItemCollectionPosition -ne $null)
        }
    }

    #Export All Data to CSV
    $DataCollection | Export-Csv -Path $CSVPath -NoTypeInformation
    Write-host -f Green "Files Inventory has been Exported to $CSVPath"

    #Get Duplicate Files by Grouping Hash code
    $Duplicates = $DataCollection | Group-Object -Property HashCode | Where {$_.Count -gt 1} | Select -ExpandProperty Group
    Write-host "Duplicate Files Based on File Hashcode:"
    $Duplicates | Format-table -AutoSize

    #Group Based on File Name
    $FileNameDuplicates = $DataCollection | Group-Object -Property FileName | Where {$_.Count -gt 1} | Select -ExpandProperty Group
    Write-host "Potential Duplicate Based on File Name:"
    $FileNameDuplicates| Format-table -AutoSize

    #Group Based on File Size
    $FileSizeDuplicates = $DataCollection | Group-Object -Property FileSize | Where {$_.Count -gt 1} | Select -ExpandProperty Group
    Write-host "Potential Duplicates Based on File Size:"
    $FileSizeDuplicates| Format-table -AutoSize
}
Catch {
    write-host -f Red "Error:" $_.Exception.Message
}
Finally {
    #Release the hash provider and the client context regardless of success or failure
    If($MD5) { $MD5.Dispose() }
    If($Ctx) { $Ctx.Dispose() }
}
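Then you can bulk delete documents from the CSV using PowerShell. Note that the exported CSV lists every scanned file, not only the duplicates, so first edit it down to just the files you want to remove; every row left in the CSV will be sent to the recycle bin.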
# Sends every file listed in a CSV to the site's recycle bin.
# Each row must carry the file's server-relative URL in a "URL" column (a
# "ServerRelativeUrl" column is also accepted) and, for messages, a "FileName" column.
# WARNING: every row in the CSV is deleted, so the CSV must list only the files to remove.

#Parameters
$SiteURL = "https://mycompany.sharepoint.com/sites/marketing"
$CSVFilePath = "C:\Temp\Duplicates.csv"

#Get Credentials to connect
$Cred = Get-Credential

Try {
    #Connect to PnP Online
    Connect-PnPOnline -Url $SiteURL -Credentials $Cred

    #Read from CSV file and delete
    Import-CSV $CSVFilePath | ForEach-Object {
        #Resolve the file URL: prefer the "URL" column, fall back to "ServerRelativeUrl"
        $FileUrl = $_.URL
        If([string]::IsNullOrWhiteSpace($FileUrl)) { $FileUrl = $_.ServerRelativeUrl }

        If([string]::IsNullOrWhiteSpace($FileUrl))
        {
            #Without a URL there is nothing to look up; 'return' moves on to the next row
            Write-Host "Skipping '$($_.FileName)': the CSV row has no URL or ServerRelativeUrl value" -ForegroundColor Yellow
            return
        }

        #Check if File exists
        $File = Get-PnPFile -ServerRelativeUrl $FileUrl -ErrorAction SilentlyContinue
        If($File)
        {
            #Delete the File - Send to Recycle bin without prompting
            Remove-PnPFile -ServerRelativeUrl $FileUrl -Recycle -Force
            Write-Host "Deleted File '$($_.FileName)' at $FileUrl" -ForegroundColor Green
        }
        Else
        {
            Write-Host "File '$($_.FileName)' doesn't exists at $FileUrl" -ForegroundColor Yellow
        }
    }
}
catch {
    write-host "Error: $($_.Exception.Message)" -foregroundcolor Red
}
If the answer is helpful, please click "Accept Answer" and kindly upvote it. If you have extra questions about this answer, please click "Comment".
Note: Please follow the steps in our documentation to enable e-mail notifications if you want to receive the related email notification for this thread.