1. How many times A-Z, a-z appear in text files in a folder |
2. Useful 'find' command for listing files and directories |
3. Writes each column name and value in CSV file to new line |
4. Search unique IP addresses in a text file |
5. The script takes a directory path and displays statistics about files, directories and the top 5 largest files |
6. List number of files, directories, total size, top 5 file types and largest file in a directory |
1. How many times A-Z, a-z appear in text files in a folder |
#!/bin/bash
list_char_counts() {
declare -i idx
myarr=(0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) # one counter per letter (index 0 = A/a ... 25 = Z/z)
old_ifs="$IFS"
IFS=$'\n' #set the Internal Field Separator to newline so as to process line by line
cd "$1" || return 1
find . -maxdepth 1 -name '*.txt' -print > /tmp/filelist
for i in `cat /tmp/filelist`
do
echo "$i"
for LINE in $(cat "$i")
do
temp=$(echo "$LINE" | sed "s/[^[:alpha:]]//g")
for (( j=0; j<${#temp}; j++ ));
do
mychar="${temp:$j:1}"
ascmychar=$(printf "%d" "'$mychar") # ASCII code of the character
if [ $ascmychar -le 90 -a $ascmychar -ge 65 ]
then
idx=$(($ascmychar - 65))
myarr[$idx]=$((${myarr[$idx]}+1))
elif [ $ascmychar -le 122 -a $ascmychar -ge 97 ]
then
idx=$(($ascmychar - 97))
myarr[$idx]=$((${myarr[$idx]}+1))
else
echo "something has gone wrong"
fi
done
done
done
IFS="$old_ifs"
one=1
count=0
for (( k=65; k<=90; k++ ))
do
p=$(echo "$k" | awk '{printf("%c",$1)}') # convert the ASCII code back to its letter
echo "$p - ${myarr[$count]}"
count=$(($count + $one))
done
}
echo "Please enter path to directory"
read mypath
echo "Entered path is : $mypath"
list_char_counts "$mypath"
|
Please enter path to directory
.
Entered path is : .
./marine.txt
./migration.txt
./nature.txt
./romans.txt
A - 923
B - 136
C - 422
D - 332
E - 1282
F - 228
G - 216
H - 367
I - 893
J - 12
K - 37
L - 407
M - 275
N - 904
O - 738
P - 231
Q - 7
R - 705
S - 764
T - 936
U - 260
V - 159
W - 86
X - 25
Y - 148
Z - 12
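Note: the script above keeps an explicit per-letter array; a much shorter alternative (a sketch only, assuming GNU grep/tr/sort/uniq and that the .txt files sit directly in the given folder) lets sort and uniq do the counting. Unlike the array version it omits letters that never occur.
#!/bin/bash
# Sketch: case-insensitive letter counts across *.txt in a folder.
dir="${1:-.}"
cat "$dir"/*.txt \
| tr 'a-z' 'A-Z' \
| grep -o '[A-Z]' \
| sort \
| uniq -c \
| awk '{printf "%s - %d\n", $2, $1}'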
|
2. Useful 'find' command for listing files and directories |
1) List all the directories in a folder
find /path/to/search -type d -print
.
./d
./c
2) List all the files in a folder
find /path/to/search -type f -print
./d/d1.txt
./c/.c
./.z
./t1.txt
3) List all the non-hidden files (files that do not start with .) in a folder
find /path/to/search -not -path '*/.*' -type f -print
./d/d1.txt
./t1.txt
4) List all the hidden files (files that start with .) in a folder
find /path/to/search -path '*/.*' -type f -print
./d/.d
./.z
./b/.b
5) Find the largest file (size in KB) in a folder (a space-safe variant appears after this list)
find /path/to/search -type f -printf '%k %p\n' | sort -nr | head -1 | awk '{print $1,$2}'
152524 ./corpora/lin_thesaurus/simN.lsp
6) Find the smallest file (size in KB) in a folder (e.g. the current folder)
find . -type f -printf '%k %p\n' | sort -n | head -1 | awk '{print $1,$2}'
0 ./corpora/english_wordnet/cousin.exc
7) Find empty files in a folder
find /path/to/search -type f -empty -print
./d/d1.txt
./d/.d
8) List the top 10 file types (file extensions) in a folder
find /path/to/search -type f | awk -F"." '{print $NF}' | sort | uniq -c | awk '{print $1" "$2}' | sort -nr | head -n 10
29025 xml
3440 txt
1995 tbf
471 sgml
209 pos
199 prd
199 mrg
199 dp
160 wrd
160 wav
9) List all files in a folder that contain a given search pattern (e.g. alphabet)
find /path/to/search -name "*.txt" -exec grep -q "alphabet" {} \; -print
./stemmers/porter_test/porter_vocabulary.txt
./corpora/pros_cons/IntegratedCons.txt
./corpora/webtext/firefox.txt
./corpora/comparative_sentences/labeledSentences.txt
./corpora/movie_reviews/neg/cv579_12542.txt
./corpora/qc/train.txt
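Note on examples 5, 6 and 8: piping the find output through awk '{print $1,$2}' truncates paths that contain spaces. A hedged, space-safe variant of example 5 (same GNU find -printf, just without the final awk):
find /path/to/search -type f -printf '%k %p\n' | sort -nr | head -1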
|
3. Writes each column name and value in a CSV file to a new line |
#!/bin/bash
# What this program does
# 1. Takes a directory containing csv files as its first argument
# 2. Creates a list of csv files
# 3. Processes each csv file in a loop
# 4. Retrieves the extension and the filename (without extension) of the file
# 5. Creates a new file name, the same as the csv file but with extension txt
# 6. Writes each column of the csv file on a new line (in the new txt file created)
exceltotxt(){
old_ifs="$IFS"
IFS=$'\n'
cd "$1" || return 1
find . -name "*.csv" -print > filelist
for filename in `cat filelist`
do
echo "$filename"
extension="${filename##*.}"
newfilename="${filename%.*}.txt"
> "$newfilename" # truncate the output file once, then append line by line
for LINE in $(cat "$filename")
do
echo "$LINE" | awk -F"," '{for(i=1;i<=NF;i++) print $i}' >> "$newfilename"
done
done
IFS="$old_ifs"
rm filelist
}
if [ "$1" == "" ]
then
echo "Enter the directory path with csv files"
exit 1
fi
exceltotxt "$1"
|
Input csv file
--------------
OrderFirstName=Taya,OrderLastName=Skarda,OrderAddress1=7096 S Pierce ct,OrderAddress2=,OrderCity=Littleton,OrderState=US-CO,OrderZip=80128,OrderEmail=tayadb@yahoo.com,OrderReferenceID=GPHOTO30036,ExtLabCustomerID=,ExtOrderKey=2121084368903,ExtInstitution=Littleton Hockey Association,ExtSubjectName=Ryder Skarda,ExtSubjectGroup=12U A RED,ExtSubjectIdentifier=,ExtTeacher=,Index=1,Qty=2,Size=5x7,Tmp.text=RESET,Tmp.sku=,Tmp.prodfirstname=Ryder,Tmp.prodlastname=Skarda,Tmp.prodidentifier=,Tmp.prodgroup=12U A RED,Tmp.prodteacher=,Media=4 Wallets on 5x7,Template=X:\Templates\Borders\yspn borders\gotphoto\4 wallets.crd,Tmp.count_base=2,Tmp.size=5x7,Tmp.media=4 Wallets on 5x7,Tmp.template=X:\Templates\Borders\yspn borders\gotphoto\4 wallets.crd,Filepath=fotograf-187335232.jpg
Output txt file
---------------
OrderFirstName=Taya
OrderLastName=Skarda
OrderAddress1=7096 S Pierce ct
OrderAddress2=
OrderCity=Littleton
OrderState=US-CO
OrderZip=80128
OrderEmail=tayadb@yahoo.com
OrderReferenceID=GPHOTO30036
ExtLabCustomerID=
ExtOrderKey=2121084368903
ExtInstitution=Littleton Hockey Association
ExtSubjectName=Ryder Skarda
ExtSubjectGroup=12U A RED
ExtSubjectIdentifier=
ExtTeacher=
Index=1
Qty=2
Size=5x7
Tmp.text=RESET
Tmp.sku=
Tmp.prodfirstname=Ryder
Tmp.prodlastname=Skarda
Tmp.prodidentifier=
Tmp.prodgroup=12U A RED
Tmp.prodteacher=
Media=4 Wallets on 5x7
Template=X:\Templates\Borders\yspn borders\gotphoto\4 wallets.crd
Tmp.count_base=2
Tmp.size=5x7
Tmp.media=4 Wallets on 5x7
Tmp.template=X:\Templates\Borders\yspn borders\gotphoto\4 wallets.crd
Filepath=fotograf-187335232.jpg
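As an aside, the per-file conversion above can also be done without the inner loop: tr turns every comma into a newline in one pass. This is only a sketch for the single-record layout shown in the sample (input.csv and output.txt are placeholder names, and it does not handle quoted fields that contain commas):
tr ',' '\n' < input.csv > output.txt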
|
4. Search unique IP addresses in a text file |
grep -oE '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}' ip.txt | sort | uniq -c
|
Input file
----------
1-This is a random text 127.0.0.1 thanks for help
2-More random 192.168.0.1 tyty
3-Hello hi 127.0.0.1 goodbye
4-Welcome and 192.168.0.1 gg
5-Here we go 10.0.0.1 Good day
6-No 192.168.0.1 one
7-More random 192.168.100.1 hey
8-Hellooooooo 127.0.0.1 big kiss
Output
------
1 10.0.0.1
3 127.0.0.1
3 192.168.0.1
1 192.168.100.1
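Note that the {1,3} pattern also matches strings that are not valid addresses (e.g. 999.999.999.999), so the counts are for IP-like tokens. If only the list of unique addresses is needed, without counts, sort -u is enough:
grep -oE '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}' ip.txt | sort -u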
|
5. The script takes a directory path and displays statistics about files, directories and the top 5 largest files |
#!/bin/bash
file_statistics() {
total_size=0
average_size=0
conv_factor=1024
sum=0
if [ ! -d "$1" ]; then
echo "Directory $1 does not exist."
exit 1
fi
cd "$1" || exit 1
num_files=$(find . -type f -not -path '*/.*' -print | wc -l)
num_dirs=$(find . -type d -not -path '*/.*' -print | grep "\./" | wc -l)
if [ "$num_files" -ne "0" ]
then
total_size=$(find . -type f -not -path '*/.*' -print0 | xargs -0 ls -ls | awk '{print $6}' | awk '{ sum += $1 } END{ print sum }')
average_size=$(echo $total_size / $num_files | bc -l)
fi
total_size_kb=$(echo $total_size / $conv_factor | bc -l)
average_size_kb=$(echo $average_size / $conv_factor | bc -l)
echo "Statistics for $1"
echo "Total number of files: $num_files"
echo "Total number of directories: $num_dirs"
printf "Total size of all files (in kilobytes):%.1f KB\n" $total_size_kb
printf "Average size of a file (in kilobytes): %.1f KB\n" $average_size_kb
echo "Top 5 largest files:"
if [ "$num_files" -ne "0" ]
then
find . -type f -not -path '*/.*' -print0 | xargs -0 ls -ls | awk '{print $6" " $10}' | sort -hr | head -n 5 | awk '{printf("%d %s - %.1f KB\n",NR,$2,$1/1024);}'
fi
}
if [ "$1" == "" ]
then
echo "Directory not entered"
exit 1
fi
file_statistics "$1"
|
Output
-------
./file_statistics.sh ./nltk_data
Statistics for ./nltk_data
Total number of files: 38286
Total number of directories: 191
Total size of all files (in kilobytes): 3534925.5 KB
Average size of a file (in kilobytes): 92.3 KB
Top 5 largest files:
1 ./corpora/lin_thesaurus/simN.lsp - 152518.8 KB
2 ./models/word2vec_sample/pruned.word2vec.txt - 135187.9 KB
3 ./corpora/framenet_v17.zip - 96882.0 KB
4 ./corpora/twitter_samples/tweets.20150430-223406.json - 92294.9 KB
5 ./corpora/lin_thesaurus.zip - 87064.5 KB
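The total size above is obtained by parsing ls output through xargs; a simpler sketch that avoids the ls parsing, assuming GNU find with -printf, computes the byte total directly:
# Total size (bytes) of all non-hidden regular files under the current directory.
total_size=$(find . -type f -not -path '*/.*' -printf '%s\n' | awk '{ sum += $1 } END { print sum+0 }')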
|
6. List number of files, directories, total size, top 5 file types and largest file in a directory |
#!/bin/bash
directory_analyzer() {
opt=$2
conv_factor=$((1024*1024))
total_size=0
lsize=0
lfile=""
lfilename=""
if [ ! -d "$1" ]; then
echo "Directory $1 does not exist."
exit 1
fi
cd "$1" || exit 1
if [ "$opt" == "yes" ]
then
num_files=$(find . -type f -print | wc -l)
num_dirs=$(find . -type d -print | grep "\./" | wc -l)
if [ "$num_files" -ne "0" ]
then
total_size=$(find . -type f -print0 | xargs -0 ls -ls | awk 'BEGIN{ sum = 0 }{ sum += $6 } END{ print sum }')
lsize=$(find . -type f -printf '%s\t%p\n' | sort -nr -k1,1 | head -1 | cut -f1)
lfile=$(find . -type f -printf '%s\t%p\n' | sort -nr -k1,1 | head -1 | cut -f2)
fi
elif [ "$opt" == "no" ]
then
num_files=$(find . -type f -not -path '*/.*' -print | wc -l)
num_dirs=$(find . -type d -not -path '*/.*' -print | grep "\./" | wc -l)
if [ "$num_files" -ne "0" ]
then
total_size=$(find . -type f -not -path '*/.*' -print0 | xargs -0 ls -ls | awk '{print $6}' | awk '{ sum += $1 } END{ print sum }')
lsize=$(find . -type f -not -path '*/.*' -printf '%s\t%p\n' | sort -nr -k1,1 | head -1 | cut -f1)
lfile=$(find . -type f -not -path '*/.*' -printf '%s\t%p\n' | sort -nr -k1,1 | head -1 | cut -f2)
fi
else
echo "Please enter 'yes' or 'no' option"
exit 1
fi
size_mb=$(echo $total_size / $conv_factor | bc -l)
lsize_mb=$(echo $lsize / $conv_factor | bc -l)
echo "Directory Analysis for $1:"
echo "Total number of files: $num_files"
echo "Total number of directories: $num_dirs"
printf "Total size of all files: %.1f MB\n" $size_mb
echo "Top 5 file types:"
if [ "$opt" == "yes" ]
then
if [ "$num_files" -ne "0" ]
then
find . -type f | awk -F"." '{print $NF}' | sort | uniq -c | awk '{print $1" "$2}' | sort -nr | head -n 5 | awk '{print NR".", $2 " - " $1 " files"}'
printf "Largest file:%s - %.1f MB\n" "$lfile" "$lsize_mb"
else
printf "Largest file:\n"
fi
fi
if [ "$opt" == "no" ]
then
if [ "$num_files" -ne "0" ]
then
find . -type f -not -path '*/.*' | awk -F"." '{print $NF}' | sort | uniq -c | awk '{print $1" "$2}' | sort -nr | head -n 5 | awk '{print NR".", $2 " - " $1 " files"}'
printf "Largest file: %s - %.1f MB\n" "$lfile" "$lsize_mb"
else
printf "Largest file:\n"
fi
fi
}
if [ "$1" == "" ]
then
echo "Directory not entered"
exit 1
fi
echo "Do you want to include hidden files in directory analysis (yes/no)"
read option
directory_analyzer "$1" "$option"
|
Output
------
Do you want to include hidden files in directory analysis (yes/no)
yes
Directory Analysis for /home/devinder/Desktop/:
Total number of files: 2000
Total number of directories: 597
Total size of all files: 1057.9 MB
Top 5 file types:
1. pdf - 468 files
2. txt - 452 files
3. png - 246 files
4. jpg - 121 files
5. jpeg - 120 files
Largest file: ./DEVINDER/test/dummy/dummy.pst - 263.5 MB
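The yes/no branches in the script repeat the same pipelines and differ only in the -not -path '*/.*' filter. A hedged sketch of how that duplication could be collapsed (assuming bash arrays and GNU find; variable names match the script above):
# Build the hidden-file filter once, then reuse it in every find call.
filter=()
if [ "$opt" == "no" ]
then
filter=(-not -path '*/.*')
fi
num_files=$(find . -type f "${filter[@]}" -print | wc -l)
num_dirs=$(find . -mindepth 1 -type d "${filter[@]}" -print | wc -l)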
|