Building custom solutions that extend, automate, and integrate Microsoft 365 apps.
After installing the NuGet package SevenZipExtractor (https://www.nuget.org/packages/SevenZipExtractor) move the 64-bit 7z.dll and the SevenZipExtractor.dll into a directory (the one holding the script will work) and change the directory name in the Add-Type cmdlet.
Change the $initialsize variable to a value slightly larger than the maximum number of Item_IDs you have in your list. The value won't affect the size of the hash, but it will allow you to sacrifice memory for speed. If memory's a real problem, just set it to 256 and let the hash figure out the bucket size and expand the hash as needed.
I don't have any of the XML files you'll be looking at, so I guessed that the "field" elements' "name" attribute value was "Item_ID". XML is CASE SENSITIVE, so be careful!
The code will only look at files in the TAR files that have an extension ".XML". Beyond reading the TAR all the rest is done in memory.
# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
# Root directory that is searched for TAR files, and the text file of IDs.
$folderPath = 'c:\junk'
$IdListPath = 'c:\junk\ids.txt'

# Initial capacity of the ID hashtable. It has an effect on lookup speed and
# memory consumption; refer to the REMARKS section of:
# https://learn.microsoft.com/en-us/dotnet/api/system.collections.hashtable.-ctor?view=netframework-4.8.1#System_Collections_Hashtable__ctor
# (Author's note: a value of 20,000,000 was tried and works with a 10-digit
# key and a 1-byte value.)
$initialsize = 50

# SevenZipExtractor (https://www.nuget.org/packages/SevenZipExtractor).
# Adjust this path to the directory containing the DLL.
Add-Type -Path 'C:\junk\SevenZipExtractor.dll'

# Case-insensitive hashtable that will hold the IDs as keys. Because lookups
# are by key, the ID list does not need to be sorted.
$Ids = [System.Collections.Specialized.CollectionsUtil]::CreateCaseInsensitiveHashtable($initialsize)
# Load the ID list (one ID per line) from $IdListPath into the $Ids hashtable.
# Every key is stored with the value 0; on any error the error record is
# emitted and the script returns.
$stream = $null   # so the finally block never acts on an undefined or stale reader
try {
    $stream = [System.IO.StreamReader]::new($IdListPath)
    # Compare against $null explicitly: ReadLine() returns an empty string for
    # a blank line, which is falsy and would otherwise end the loop before the
    # end of the file is reached.
    while ($null -ne ($line = $stream.ReadLine())) {
        # Trim so the stored key has no stray surrounding whitespace.
        $id = $line.Trim()
        if ($id.Length -eq 0) {
            continue   # ignore blank lines
        }
        # Indexer assignment instead of Add(): a duplicate ID in the list
        # simply overwrites itself rather than throwing and aborting the load.
        $Ids[$id] = [byte]0
    }
}
catch {
    $_
    return
}
finally {
    # The reader is still $null when the StreamReader constructor itself failed.
    if ($null -ne $stream) {
        $stream.Dispose()
    }
}
# Scan every *.TAR file under $folderPath (subdirectories included). Each
# archive entry whose name matches *.xml is extracted to memory and parsed;
# for every <field name="Item_ID"> element whose trimmed text is a key of
# $Ids, that key's value is set to 1. The elapsed time is emitted at the end.
$stopwatch = [system.diagnostics.stopwatch]::StartNew()

# Get all TAR file names / full paths from the directory and its subdirectories.
[System.IO.Directory]::EnumerateFiles($folderPath, '*.TAR', [System.IO.SearchOption]::AllDirectories) |
    ForEach-Object {
        $archivePath = $_   # keep the path: $_ is rebound inside catch blocks
        $extractor = $null
        try {
            $extractor = New-Object SevenZipExtractor.ArchiveFile($archivePath)
            foreach ($entry in $extractor.Entries) {
                if ($entry.FileName -like "*.xml") {
                    $memoryStream = New-Object System.IO.MemoryStream
                    try {
                        $entry.Extract($memoryStream)

                        # Parse straight from the stream: this reads the whole
                        # entry (no byte is dropped) and lets the XML parser
                        # handle a byte-order mark or a declared encoding.
                        $memoryStream.Position = 0
                        $XML = New-Object System.Xml.XmlDocument
                        $XML.Load($memoryStream)

                        foreach ($hit in (Select-Xml -XPath '//field[@name="Item_ID"]' -Xml $XML)) {
                            # InnerText is an empty string (never $null) for an
                            # element without text, so Trim() is always safe.
                            $key = $hit.Node.InnerText.Trim()
                            if ($key.Length -gt 0 -and $Ids.ContainsKey($key)) {
                                # Indexer syntax: the key comes from data, so
                                # avoid member-style access ($Ids.$key).
                                $Ids[$key] = [byte]1
                            }
                        }
                    }
                    catch {
                        # One unreadable or malformed entry should not stop the scan.
                        Write-Warning "Skipping '$($entry.FileName)' in '$archivePath': $($_.Exception.Message)"
                    }
                    finally {
                        $memoryStream.Dispose()
                    }
                }
            }
        }
        finally {
            # Release the archive handle even when an entry failed.
            if ($null -ne $extractor) {
                $extractor.Dispose()
            }
        }
    }

$stopwatch.Stop()
$stopwatch.Elapsed
# Write the key of every $Ids entry whose value is still 0 to the report file.
$Ids.GetEnumerator() |
    Where-Object { $_.Value -eq 0 } |
    ForEach-Object { $_.Key } |
    Out-File -FilePath c:\Junk\UnmatchedIds.txt