You can run the following PowerShell script to scan every file in all document libraries of a site. It records each file's name, hash, and size, exports the full file inventory to a CSV file, and then lists the duplicate files found by comparing those values.
#Load SharePoint CSOM Assemblies
Add-Type -Path "C:\Program Files\Common Files\Microsoft Shared\Web Server Extensions\16\ISAPI\Microsoft.SharePoint.Client.dll"
Add-Type -Path "C:\Program Files\Common Files\Microsoft Shared\Web Server Extensions\16\ISAPI\Microsoft.SharePoint.Client.Runtime.dll"
#Parameters
$SiteURL = "https://tenant.sharepoint.com/sites/emilytest"
$CSVPath = "C:\Duplicates.csv"
$BatchSize = 2000
#Growable list for result data (appending to an array with += re-copies the whole array for every file)
$DataCollection = New-Object 'System.Collections.Generic.List[object]'
#Get credentials to connect
$Cred = Get-Credential
#Declared before Try so the Finally block can release them even when setup fails part-way
$Ctx = $null
$MD5 = $null
# Scans every visible, non-empty document library of $SiteURL, computes an MD5 hash per file,
# exports the complete inventory to $CSVPath and prints duplicate candidates grouped by
# hash, by file name and by file size. Any error aborts the scan and is reported by the Catch block.
Try {
    #Setup the Context
    $Ctx = New-Object Microsoft.SharePoint.Client.ClientContext($SiteURL)
    $Ctx.Credentials = New-Object Microsoft.SharePoint.Client.SharePointOnlineCredentials($Cred.UserName, $Cred.Password)

    #Get the Web and its lists
    $Web = $Ctx.Web
    $Lists = $Web.Lists
    $Ctx.Load($Web)
    $Ctx.Load($Lists)
    $Ctx.ExecuteQuery()

    #One hash provider is reused for all files; ComputeHash can be called repeatedly on the same instance
    $MD5 = New-Object -TypeName System.Security.Cryptography.MD5CryptoServiceProvider

    #Iterate through each list on the web
    ForEach ($List in $Lists) {
        #Filter lists: visible, non-empty document libraries, excluding system libraries
        If ($List.BaseType -eq "DocumentLibrary" -and $List.Hidden -eq $False -and $List.ItemCount -gt 0 -and $List.Title -Notin ("Site Pages", "Style Library", "Preservation Hold Library")) {
            #Define CAML query to get files from the list in batches.
            #The XML must start directly with <View>; a plain string is used so that no stray
            #here-string marker ends up inside the query text.
            $Query = New-Object Microsoft.SharePoint.Client.CamlQuery
            $Query.ViewXml = "<View Scope='RecursiveAll'>" +
                "<Query><OrderBy><FieldRef Name='ID' Ascending='TRUE'/></OrderBy></Query>" +
                "<RowLimit Paged='TRUE'>$BatchSize</RowLimit>" +
                "</View>"
            $Counter = 1

            #Get files from the library in batches
            Do {
                $ListItems = $List.GetItems($Query)
                $Ctx.Load($ListItems)
                $Ctx.ExecuteQuery()

                ForEach ($Item in $ListItems) {
                    #Filter files (folders are skipped but still counted for the progress bar)
                    If ($Item.FileSystemObjectType -eq "File") {
                        #Get the file from the item
                        $File = $Item.File
                        $Ctx.Load($File)
                        $Ctx.ExecuteQuery()

                        #Clamp to 100: Write-Progress rejects larger values, which would abort the whole
                        #scan if the library received new items after ItemCount was read
                        $Percent = [Math]::Min(100, [int]($Counter / $List.ItemCount * 100))
                        Write-Progress -PercentComplete $Percent -Activity "Processing File $Counter of $($List.ItemCount) in $($List.Title) of $($Web.URL)" -Status "Scanning File '$($File.Name)'"

                        #Get the file hash; the downloaded stream is always released afterwards
                        $Bytes = $null
                        Try {
                            $Bytes = $File.OpenBinaryStream()
                            $Ctx.ExecuteQuery()
                            $HashCode = [System.BitConverter]::ToString($MD5.ComputeHash($Bytes.Value))
                        }
                        Finally {
                            If ($null -ne $Bytes -and $null -ne $Bytes.Value) { $Bytes.Value.Dispose() }
                        }

                        #Collect data (ordered so the CSV columns keep this order)
                        $Data = [PSCustomObject][ordered]@{
                            FileName = $File.Name
                            HashCode = $HashCode
                            URL      = $File.ServerRelativeUrl
                            FileSize = $File.Length
                        }
                        $DataCollection.Add($Data)
                    }
                    $Counter++
                }

                #Update position of the ListItemCollectionPosition; it is $null after the last page
                $Query.ListItemCollectionPosition = $ListItems.ListItemCollectionPosition
            } While ($null -ne $Query.ListItemCollectionPosition)
        }
    }

    #Export all data to CSV
    $DataCollection | Export-Csv -Path $CSVPath -NoTypeInformation
    Write-host -f Green "Files Inventory has been Exported to $CSVPath"

    #Get duplicate files by grouping on hash code
    $Duplicates = $DataCollection | Group-Object -Property HashCode | Where {$_.Count -gt 1} | Select -ExpandProperty Group
    Write-host "Duplicate Files Based on File Hashcode:"
    $Duplicates | Format-table -AutoSize

    #Group based on file name
    $FileNameDuplicates = $DataCollection | Group-Object -Property FileName | Where {$_.Count -gt 1} | Select -ExpandProperty Group
    Write-host "Potential Duplicate Based on File Name:"
    $FileNameDuplicates | Format-table -AutoSize

    #Group based on file size
    $FileSizeDuplicates = $DataCollection | Group-Object -Property FileSize | Where {$_.Count -gt 1} | Select -ExpandProperty Group
    Write-host "Potential Duplicates Based on File Size:"
    $FileSizeDuplicates | Format-table -AutoSize
}
Catch {
    write-host -f Red "Error:" $_.Exception.Message
}
Finally {
    #Release the hash provider and the client context whether or not the scan succeeded
    If ($null -ne $MD5) { $MD5.Dispose() }
    If ($null -ne $Ctx) { $Ctx.Dispose() }
}
If the answer is helpful, please click "Accept Answer" and kindly upvote it. If you have extra questions about this answer, please click "Comment".
Note: If you want to receive e-mail notifications for this thread, please follow the steps in our documentation to enable them.