Finding the no of files and directories in a path
От | Nipuna |
---|---|
Тема | Finding the no of files and directories in a path |
Дата | |
Msg-id | CAPCz1_0JYiqSbO9CCpuqZsAMPq4LH9C+Mw6nQguoRGZQHE+eAg@mail.gmail.com обсуждение исходный текст |
Список | pgsql-novice |
Hi,
I have a table with 45 million records.I have 3 columns say, file_size, kb_alloc, misc_attr
The misc_attr column is the path. The sample record is given below
"57971"; "65536"; "FAu -- /gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/respell.map.spelling"
"851"; "8192"; "FAu -- /gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/rewriteForm.map"
"8192" "16384"; "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/txt/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192" "16384" "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/log/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192" "16384" "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/time/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"4487"; "8192"; "FAu -- /gpfs/scratch05/data/lm/scratch/yi_su/numbered-list/data.old/gpfs/static02/data/lm/data/english/text/scansoft/english-us/medical/transcend/90/90.47915.2000.UNKNOWN.txt.in.gz"
"8192"; "16384"; "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-12.00.300"
"8192" "16384" "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.00.100"
"7" "8192" "FAu -- /gpfs/scratch05/data/lm/scratch/sophie_charignon/plato/jpn/.hg/undo.branch"
"8192" "16384" "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.02.100"
I want a table which gives me the directory count and file count and their file size.My table should look like this;
dir_path kb_alloc USED_SIZE folder_count file_count
"/gpfs/scratch05/data/lm/build_weekly/vocabs"; 49152; 37249; 9; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak"; 49152; 37249; 8; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR"; 49152; 37249; 7; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR/legal"; 49152; 37249; 6; 1
"/gpfs/scratch05/data/lm/scratch/erik_larsson"; 2093824; 21783001; 59; 25
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep"; 21651456; 21620619; 30; 7
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds"; 21602304 ; 1572331; 22; 6
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds/lm"; 21602304; 21572331; 21; 6
I have written a function to create the table in this format. But a per the calculation it might the take 600+ hours to return the result. Is there any way I could optimize the function I have written. Below is the function I have written;
-- Function: hierarchial_file_structure()
-- DROP FUNCTION hierarchial_file_structure();
CREATE OR REPLACE FUNCTION hierarchial_file_structure()
RETURNS void AS
$BODY$
DECLARE
file_size1 BIGINT;
folder_count1 integer;
kb_alloc1 BIGINT;
misc_attr1 TEXT;
folder_names TEXT ARRAY;
path_names TEXT ARRAY;
current_path TEXT;
isNewRow BOOLEAN;
mFILES RECORD;
DataType TEXT;
temp_count BIGINT;
sqlcount bigint;
BEGIN
DELETE FROM file_structure;
sqlcount :=0;
FOR mFILES IN SELECT CAST(coalesce(file_size, '0') AS bigint) as file_size, CAST(coalesce(kb_alloc, '0') AS bigint) as kb_alloc, misc_attr FROM storage where (substring(misc_attr,1,1)) in ('F','D')
LOOP
file_size1 := mFILES.file_size;
kb_alloc1 := mFILES.kb_alloc;
misc_attr1 := mFILES.misc_attr;
DataType :=(substring(misc_attr1,1,1));
folder_names := regexp_split_to_array(misc_attr1, '/');
current_path := '';
-- LOOP THROUGH FOLDER NAMES AND CREATE THE PATH NAMES
IF DataType = 'F' THEN
-- IGNORE THE LAST ELEMENT SINCE ITS THE FILE NAME
FOR iFolderArrayCount IN 2..(array_length(folder_names,1)-1)
LOOP
current_path := current_path || '/' || folder_names[iFolderArrayCount];
path_names[iFolderArrayCount-1] := current_path;
END LOOP;
ELSE
FOR iFolderArrayCount IN 2..(array_length(folder_names,1))
LOOP
current_path := current_path || '/' || folder_names[iFolderArrayCount];
path_names[iFolderArrayCount-1] := current_path;
END LOOP;
END IF;
--LOOP THROUH THE PATH NAMES
FOR iPathArrayCount IN 1..(array_length(path_names,1))
LOOP
temp_count := array_length(path_names,1) - iPathArrayCount;
select into folder_count1 count(*) from file_structure where dir_path = ANY(path_names[iPathArrayCount+1:array_length(path_names,1)]);
folder_count1 := temp_count - folder_count1;
current_path := path_names[iPathArrayCount];
IF EXISTS(select distinct(dir_path) from file_structure where dir_path = current_path) THEN
--Row Exists hence update the count using the count of folders that does not exist in the summary table
IF(DataType = 'F') THEN
update file_structure set
kb_alloc = kb_alloc + kb_alloc1,
used_size = used_size+ file_size1,
file_count = file_count +1,
folder_count = folder_count+ folder_count1
where dir_path = current_path;
ELSE
update file_structure set
kb_alloc = kb_alloc + kb_alloc1,
used_size = used_size+ file_size1,
folder_count = folder_count+folder_count1
where dir_path = current_path;
END IF;
ELSE
IF(DataType = 'F') THEN
INSERT INTO file_structure (dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
(current_path,kb_alloc1, file_size1,folder_count1,1);
ELSE
INSERT INTO file_structure (dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
(current_path,kb_alloc1, file_size1,folder_count1,0);
END IF;
END IF;
END LOOP;
END LOOP;
END;
$BODY$
LANGUAGE plpgsql VOLATILE
COST 100;
ALTER FUNCTION hierarchial_file_structure()
OWNER TO postgres;
Please let me know if there is a better way to write this function.Thanks in advance
--
I have a table with 45 million records.I have 3 columns say, file_size, kb_alloc, misc_attr
The misc_attr column is the path. The sample record is given below
"57971"; "65536"; "FAu -- /gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/respell.map.spelling"
"851"; "8192"; "FAu -- /gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/rewriteForm.map"
"8192" "16384"; "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/txt/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192" "16384" "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/log/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192" "16384" "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/time/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"4487"; "8192"; "FAu -- /gpfs/scratch05/data/lm/scratch/yi_su/numbered-list/data.old/gpfs/static02/data/lm/data/english/text/scansoft/english-us/medical/transcend/90/90.47915.2000.UNKNOWN.txt.in.gz"
"8192"; "16384"; "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-12.00.300"
"8192" "16384" "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.00.100"
"7" "8192" "FAu -- /gpfs/scratch05/data/lm/scratch/sophie_charignon/plato/jpn/.hg/undo.branch"
"8192" "16384" "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.02.100"
I want a table which gives me the directory count and file count and their file size.My table should look like this;
dir_path kb_alloc USED_SIZE folder_count file_count
"/gpfs/scratch05/data/lm/build_weekly/vocabs"; 49152; 37249; 9; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak"; 49152; 37249; 8; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR"; 49152; 37249; 7; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR/legal"; 49152; 37249; 6; 1
"/gpfs/scratch05/data/lm/scratch/erik_larsson"; 2093824; 21783001; 59; 25
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep"; 21651456; 21620619; 30; 7
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds"; 21602304 ; 1572331; 22; 6
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds/lm"; 21602304; 21572331; 21; 6
I have written a function to create the table in this format. But a per the calculation it might the take 600+ hours to return the result. Is there any way I could optimize the function I have written. Below is the function I have written;
-- Function: hierarchial_file_structure()
-- DROP FUNCTION hierarchial_file_structure();
CREATE OR REPLACE FUNCTION hierarchial_file_structure()
RETURNS void AS
$BODY$
DECLARE
file_size1 BIGINT;
folder_count1 integer;
kb_alloc1 BIGINT;
misc_attr1 TEXT;
folder_names TEXT ARRAY;
path_names TEXT ARRAY;
current_path TEXT;
isNewRow BOOLEAN;
mFILES RECORD;
DataType TEXT;
temp_count BIGINT;
sqlcount bigint;
BEGIN
DELETE FROM file_structure;
sqlcount :=0;
FOR mFILES IN SELECT CAST(coalesce(file_size, '0') AS bigint) as file_size, CAST(coalesce(kb_alloc, '0') AS bigint) as kb_alloc, misc_attr FROM storage where (substring(misc_attr,1,1)) in ('F','D')
LOOP
file_size1 := mFILES.file_size;
kb_alloc1 := mFILES.kb_alloc;
misc_attr1 := mFILES.misc_attr;
DataType :=(substring(misc_attr1,1,1));
folder_names := regexp_split_to_array(misc_attr1, '/');
current_path := '';
-- LOOP THROUGH FOLDER NAMES AND CREATE THE PATH NAMES
IF DataType = 'F' THEN
-- IGNORE THE LAST ELEMENT SINCE ITS THE FILE NAME
FOR iFolderArrayCount IN 2..(array_length(folder_names,1)-1)
LOOP
current_path := current_path || '/' || folder_names[iFolderArrayCount];
path_names[iFolderArrayCount-1] := current_path;
END LOOP;
ELSE
FOR iFolderArrayCount IN 2..(array_length(folder_names,1))
LOOP
current_path := current_path || '/' || folder_names[iFolderArrayCount];
path_names[iFolderArrayCount-1] := current_path;
END LOOP;
END IF;
--LOOP THROUH THE PATH NAMES
FOR iPathArrayCount IN 1..(array_length(path_names,1))
LOOP
temp_count := array_length(path_names,1) - iPathArrayCount;
select into folder_count1 count(*) from file_structure where dir_path = ANY(path_names[iPathArrayCount+1:array_length(path_names,1)]);
folder_count1 := temp_count - folder_count1;
current_path := path_names[iPathArrayCount];
IF EXISTS(select distinct(dir_path) from file_structure where dir_path = current_path) THEN
--Row Exists hence update the count using the count of folders that does not exist in the summary table
IF(DataType = 'F') THEN
update file_structure set
kb_alloc = kb_alloc + kb_alloc1,
used_size = used_size+ file_size1,
file_count = file_count +1,
folder_count = folder_count+ folder_count1
where dir_path = current_path;
ELSE
update file_structure set
kb_alloc = kb_alloc + kb_alloc1,
used_size = used_size+ file_size1,
folder_count = folder_count+folder_count1
where dir_path = current_path;
END IF;
ELSE
IF(DataType = 'F') THEN
INSERT INTO file_structure (dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
(current_path,kb_alloc1, file_size1,folder_count1,1);
ELSE
INSERT INTO file_structure (dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
(current_path,kb_alloc1, file_size1,folder_count1,0);
END IF;
END IF;
END LOOP;
END LOOP;
END;
$BODY$
LANGUAGE plpgsql VOLATILE
COST 100;
ALTER FUNCTION hierarchial_file_structure()
OWNER TO postgres;
Please let me know if there is a better way to write this function.Thanks in advance
--
Regards,
Nipuna
В списке pgsql-novice по дате отправления:
Предыдущее
От: Tom LaneДата:
Сообщение: Re: Re: [NOVICE] Re: [NOVICE] Problems with ñ and tildes / CSV import problems in PostgreSQL 9.1
Следующее
От: Zach SeamanДата:
Сообщение: Re: [NOVICE] Re: [NOVICE] Re: [NOVICE] Problems with ñ and tildes / CSV import problems in PostgreSQL 9.1