Articles
1. How many times A-Z, a-z appear in text files in a folder
2. Useful 'find' command for listing files and directories
3. Write each column name and value of a CSV file on a new line
4. Search unique IP addresses in a text file
5. The script takes a directory path and displays statistics about files, directories and the top 5 largest files
6. List number of files, directories, total size, top 5 file types and largest file in a directory
1. How many times A-Z, a-z appear in text files in a folder

#!/bin/bash

#######################################
# Count how often each letter A-Z occurs (case-insensitively) in the *.txt
# files that sit directly inside a directory.
# Arguments:
#   $1 - directory holding the text files
# Outputs:
#   The path of every file read, followed by 26 lines "<LETTER> - <count>".
#   Characters other than ASCII letters are ignored.
# Returns:
#   1 when the directory cannot be entered, otherwise the status of awk.
# Note:
#   The working directory of the caller is changed to $1.
#######################################
list_char_counts() {
  local dir=$1
  local -a files=()
  local f

  cd "$dir" || return 1

  # Let the shell expand the glob instead of passing the names through a
  # shared temp file; an unmatched pattern stays literal and fails the -f test.
  for f in ./*.txt; do
    [[ -f "$f" ]] || continue
    files+=("$f")
    printf '%s\n' "$f"
  done

  # A single awk process does all the counting, so no command is forked per
  # character and file content is never word-split or glob-expanded.
  # /dev/null stands in when nothing matched so awk never waits on stdin.
  # LC_ALL=C makes toupper() and the range test act on plain ASCII bytes.
  LC_ALL=C awk '
    {
      line = toupper($0)
      len = length(line)
      for (i = 1; i <= len; i++) {
        ch = substr(line, i, 1)
        if (ch >= "A" && ch <= "Z") count[ch]++
      }
    }
    END {
      # 65..90 are the ASCII codes of A..Z; letters never seen print as 0.
      for (code = 65; code <= 90; code++) {
        ch = sprintf("%c", code)
        printf "%s - %d\n", ch, count[ch]
      }
    }
  ' "${files[@]:-/dev/null}"
}

# Ask for the directory interactively, echo it back, then run the count.
echo "Please enter path to directory"
# -r keeps backslashes in the typed path literal.
read -r mypath
echo "Entered path is : $mypath"

# Quoted so that a path containing spaces is passed as a single argument.
list_char_counts "$mypath"

Please enter path to directory
.
Entered path is : .
./marine.txt
./migration.txt
./nature.txt
./romans.txt
A - 923
B - 136
C - 422
D - 332
E - 1282
F - 228
G - 216
H - 367
I - 893
J - 12
K - 37
L - 407
M - 275
N - 904
O - 738
P - 231
Q - 7
R - 705
S - 764
T - 936
U - 260
V - 159
W - 86
X - 25
Y - 148
Z - 12
2. Useful 'find' command for listing files and directories

1) List all the directories in folder

# -type d keeps only directories; the starting directory itself is listed too.
find /path/to/search -type d -print

.
./d
./c

2) List all the files in folder 

# -type f keeps only regular files, hidden ones included.
find /path/to/search -type f -print

./d/d1.txt
./c/.c
./.z
./t1.txt

3) List all the non hidden files (files that do not start with .) in folder 

# -not -path '*/.*' rejects any path with a component that begins with a dot,
# so files inside hidden directories are left out as well.
find /path/to/search -not -path '*/.*' -type f -print

./d/d1.txt
./t1.txt

4) List all the hidden files (files that start with .) in folder

# -path '*/.*' accepts any path with a component that begins with a dot, so
# regular files inside hidden directories are listed along with dot-files.
find /path/to/search -path '*/.*' -type f -print

./d/.d
./.z
./b/.b

5) Find the largest size (KB) file in a folder
 
# %k is the size in 1 KB blocks. The sorted line is printed as-is: the former
# awk stage used \$1 inside single quotes (an awk syntax error) and would have
# cut file names at the first space.
find /path/to/search -type f -printf '%k %p\n' | sort -nr | head -n 1

152524 ./corpora/lin_thesaurus/simN.lsp

6) Find the smallest size (KB) file in a folder (e.g. current folder)

# Ascending numeric sort puts the smallest %k (size in 1 KB blocks) first.
# The former awk stage had an unterminated quote and \$ escapes that awk
# rejects; it is not needed because the line is already "size path".
find . -type f -printf '%k %p\n' | sort -n | head -n 1

0 ./corpora/english_wordnet/cousin.exc

7) Find empty files in a folder

# -empty must come before -print: tests run left to right, and a leading
# -print would list every regular file before -empty is even evaluated.
find /path/to/search -type f -empty -print

./d/d1.txt
./d/.d

8) List top 10 file types (file extension) in a folder

# %f prints only the file name, so dots in directory names cannot be mistaken
# for an extension; -name '*.*' skips files that have no dot at all.
# The awk programs are single-quoted, so $NF, $1 and $2 need no backslash.
find /path/to/search -type f -name '*.*' -printf '%f\n' | awk -F"." '{print $NF}' | sort | uniq -c | awk '{print $1" "$2}' | sort -nr | head -n 10

29025 xml
3440 txt
1995 tbf
471 sgml
209 pos
199 prd
199 mrg
199 dp
160 wrd
160 wav

9) List all files in a folder having a given search pattern (e.g. alphabet)

# grep -l prints just the names of the files that contain a match, so the
# matching lines never have to be filtered out of the output afterwards.
find /path/to/search -type f -name "*.txt" -exec grep -l -- "alphabet" {} +

./stemmers/porter_test/porter_vocabulary.txt
./corpora/pros_cons/IntegratedCons.txt
./corpora/webtext/firefox.txt
./corpora/comparative_sentences/labeledSentences.txt
./corpora/movie_reviews/neg/cv579_12542.txt
./corpora/qc/train.txt
3. Write each column name and value of a CSV file on a new line

#!/bin/bash

# What this program does
# 1. Takes a directory containing csv files as its first argument
# 2. Creates a list of the csv files
# 3. Processes each of the csv files in a loop
# 4. Retrieves the extension and the filename (without extension) of the file
# 5. Creates a new file name with the same name as the csv file but with the extension txt
# 6. Writes every column of the csv file on its own line (in the new txt file created)

#######################################
# For every *.csv file below a directory, write a sibling .txt file that
# holds each comma-separated field of the CSV on its own line.
# Arguments:
#   $1 - directory that is searched recursively for csv files
# Outputs:
#   Prints the path of each csv file as it is processed; creates or
#   overwrites <name>.txt next to every <name>.csv.
# Returns:
#   1 when the directory cannot be entered.
# Note:
#   The working directory of the caller is changed to $1. Fields are split
#   on every comma; quoted CSV fields are not interpreted.
#######################################
exceltotxt() {
  local dir=$1
  local filename newfilename

  cd "$dir" || return 1

  # NUL-delimited names straight from find: no scratch "filelist" is left in
  # the data directory and unusual file names survive intact.
  while IFS= read -r -d '' filename; do
    printf '%s\n' "$filename"
    newfilename="${filename%.*}.txt"
    # One awk run per file with a single redirection. Redirecting inside a
    # per-line loop truncated the output each time and kept only the last row.
    awk -F',' '{ for (i = 1; i <= NF; i++) print $i }' "$filename" > "$newfilename"
  done < <(find . -type f -name '*.csv' -print0)
}

# Require the directory argument, then convert every CSV file below it.
if [[ -z "${1:-}" ]]; then
  echo "Enter the directory path with csv files"
  exit 1
fi
# Quoted so that a path containing spaces reaches the function as one argument.
exceltotxt "$1"

Input csv file
--------------

OrderFirstName=Taya,OrderLastName=Skarda,OrderAddress1=7096 S Pierce ct,OrderAddress2=,OrderCity=Littleton,OrderState=US-CO,OrderZip=80128,OrderEmail=tayadb@yahoo.com,OrderReferenceID=GPHOTO30036,ExtLabCustomerID=,ExtOrderKey=2121084368903,ExtInstitution=Littleton Hockey Association,ExtSubjectName=Ryder Skarda,ExtSubjectGroup=12U A RED,ExtSubjectIdentifier=,ExtTeacher=,Index=1,Qty=2,Size=5x7,Tmp.text=RESET,Tmp.sku=,Tmp.prodfirstname=Ryder,Tmp.prodlastname=Skarda,Tmp.prodidentifier=,Tmp.prodgroup=12U A RED,Tmp.prodteacher=,Media=4 Wallets on 5x7,Template=X:\Templates\Borders\yspn borders\gotphoto\4 wallets.crd,Tmp.count_base=2,Tmp.size=5x7,Tmp.media=4 Wallets on 5x7,Tmp.template=X:\Templates\Borders\yspn borders\gotphoto\4 wallets.crd,Filepath=fotograf-187335232.jpg

Output txt file
---------------

OrderFirstName=Taya
OrderLastName=Skarda
OrderAddress1=7096 S Pierce ct
OrderAddress2=
OrderCity=Littleton
OrderState=US-CO
OrderZip=80128
OrderEmail=tayadb@yahoo.com
OrderReferenceID=GPHOTO30036
ExtLabCustomerID=
ExtOrderKey=2121084368903
ExtInstitution=Littleton Hockey Association
ExtSubjectName=Ryder Skarda
ExtSubjectGroup=12U A RED
ExtSubjectIdentifier=
ExtTeacher=
Index=1
Qty=2
Size=5x7
Tmp.text=RESET
Tmp.sku=
Tmp.prodfirstname=Ryder
Tmp.prodlastname=Skarda
Tmp.prodidentifier=
Tmp.prodgroup=12U A RED
Tmp.prodteacher=
Media=4 Wallets on 5x7
Template=X:\Templates\Borders\yspn borders\gotphoto\4 wallets.crd
Tmp.count_base=2
Tmp.size=5x7
Tmp.media=4 Wallets on 5x7
Tmp.template=X:\Templates\Borders\yspn borders\gotphoto\4 wallets.crd
Filepath=fotograf-187335232.jpg
4. Search unique IP addresses in a text file
# -o prints each dotted-quad match on its own line; sort | uniq -c then counts
# how often every distinct address occurs. The pattern only checks the shape
# (1-3 digits per part), so out-of-range values such as 999.1.1.1 also match.
grep -oE '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}' ip.txt | sort | uniq -c

Input file
----------
1-This is a random text 127.0.0.1 thanks for help
2-More random 192.168.0.1 tyty
3-Hello hi 127.0.0.1 goodbye
4-Welcome and 192.168.0.1 gg
5-Here we go 10.0.0.1 Good day
6-No 192.168.0.1 one
7-More random 192.168.100.1 hey
8-Hellooooooo 127.0.0.1 big kiss

Output
1 10.0.0.1
3 127.0.0.1
3 192.168.0.1
1 192.168.100.1
5. The script takes a directory path and displays statistics about files, directories and the top 5 largest files

#!/bin/bash

#######################################
# Print statistics for the non-hidden content of a directory tree: number of
# files and directories, total and average file size in KB, and the five
# largest files.
# Arguments:
#   $1 - directory to analyse
# Outputs:
#   The report on stdout.
# Returns:
#   Exits the shell with status 1 when $1 is not a directory or cannot be
#   entered.
# Note:
#   Requires GNU find (-printf). The working directory of the caller is
#   changed to $1. File names containing newlines are not supported.
#######################################
file_statistics() {
  local dir=$1
  local num_files num_dirs
  local total_size_kb="0.0" average_size_kb="0.0"

  if [ ! -d "$dir" ]; then
    echo "Directory $dir does not exist."
    exit 1
  fi

  cd "$dir" || exit 1

  # One byte is emitted per match and the bytes are counted. -mindepth 1
  # keeps "." itself out of the directory count.
  num_files=$(find . -type f -not -path '*/.*' -printf '.' | wc -c)
  num_dirs=$(find . -mindepth 1 -type d -not -path '*/.*' -printf '.' | wc -c)

  if [ "$num_files" -ne 0 ]; then
    # Sizes come from find itself (%s = bytes) rather than from columns of
    # "ls -ls"; awk does the division, so bc is not needed.
    read -r total_size_kb average_size_kb < <(
      find . -type f -not -path '*/.*' -printf '%s\n' |
        awk '{ sum += $1 } END { printf "%.1f %.1f\n", sum / 1024, sum / NR / 1024 }'
    )
  fi

  echo "Statistics for $dir"
  echo "Total number of files: $num_files"
  echo "Total number of directories: $num_dirs"
  printf "Total size of all files (in kilobytes):%s KB\n" "$total_size_kb"
  printf "Average size of a file (in kilobytes): %s KB\n" "$average_size_kb"
  echo "Top 5 largest files:"

  if [ "$num_files" -ne 0 ]; then
    # A tab separates size and path, so names that contain spaces are printed
    # in full instead of being cut at the first blank.
    find . -type f -not -path '*/.*' -printf '%s\t%p\n' |
      LC_ALL=C sort -t $'\t' -k1,1nr |
      head -n 5 |
      awk '{
        tab = index($0, "\t")
        printf("%d %s - %.1f KB\n", NR, substr($0, tab + 1), substr($0, 1, tab - 1) / 1024)
      }'
  fi
}

# Require the directory argument before running the report.
if [[ -z "${1:-}" ]]; then
  echo "Directory not entered"
  exit 1
fi

# Quoted so that a path containing spaces reaches the function as one argument.
file_statistics "$1"

Output
-------
./file_statistics.sh ./nltk_data

Statistics for ./nltk_data
Total number of files: 38286
Total number of directories: 191
Total size of all files (in kilobytes):3534925.5 KB
Average size of a file (in kilobytes): 92.3 KB
Top 5 largest files:
1 ./corpora/lin_thesaurus/simN.lsp - 152518.8 KB
2 ./models/word2vec_sample/pruned.word2vec.txt - 135187.9 KB
3 ./corpora/framenet_v17.zip - 96882.0 KB
4 ./corpora/twitter_samples/tweets.20150430-223406.json - 92294.9 KB
5 ./corpora/lin_thesaurus.zip - 87064.5 KB
6. List number of files, directories, total size, top 5 file types and largest file in a directory

#!/bin/bash

#######################################
# Print an analysis of a directory tree: number of files and directories,
# total size in MB, the five most common file extensions and the largest file.
# Arguments:
#   $1 - directory to analyse
#   $2 - "yes" to include hidden files and directories, "no" to skip them
# Outputs:
#   The report on stdout.
# Returns:
#   Exits the shell with status 1 when $1 is not a directory, cannot be
#   entered, or $2 is neither "yes" nor "no".
# Note:
#   Requires GNU find (-printf). The working directory of the caller is
#   changed to $1. File names containing newlines are not supported.
#######################################
directory_analyzer() {
  local dir=$1
  local opt=$2
  local -r conv_factor=$((1024 * 1024))
  local -a hidden_filter=()
  local largest_label
  local num_files num_dirs
  local size_mb="0.0" lsize_mb="0.0" lfile=""

  if [ ! -d "$dir" ]; then
    echo "Directory $dir does not exist."
    exit 1
  fi

  cd "$dir" || exit 1

  # The two modes differ only in the find filter and, for compatibility with
  # the existing output, in the spacing after the "Largest file:" label.
  case "$opt" in
    yes)
      largest_label='Largest file:'
      ;;
    no)
      hidden_filter=(-not -path '*/.*')
      largest_label='Largest file: '
      ;;
    *)
      echo "Please enter 'yes' or 'no' option"
      exit 1
      ;;
  esac

  # One byte is emitted per match and the bytes are counted. -mindepth 1
  # keeps "." itself out of the directory count.
  num_files=$(find . -type f "${hidden_filter[@]}" -printf '.' | wc -c)
  num_dirs=$(find . -mindepth 1 -type d "${hidden_filter[@]}" -printf '.' | wc -c)

  if [ "$num_files" -ne 0 ]; then
    # A single pass yields the total size, the largest size and its path.
    # On equal sizes the file that find reports first is kept.
    IFS=$'\t' read -r size_mb lsize_mb lfile < <(
      find . -type f "${hidden_filter[@]}" -printf '%s\t%p\n' |
        awk -v mb="$conv_factor" '
          {
            tab = index($0, "\t")
            size = substr($0, 1, tab - 1) + 0
            sum += size
            if (NR == 1 || size > max) {
              max = size
              file = substr($0, tab + 1)
            }
          }
          END { printf "%.1f\t%.1f\t%s\n", sum / mb, max / mb, file }
        '
    )
  fi

  echo "Directory Analysis for $dir:"
  echo "Total number of files: $num_files"
  echo "Total number of directories: $num_dirs"
  printf "Total size of all files: %s MB\n" "$size_mb"
  echo "Top 5 file types:"

  if [ "$num_files" -eq 0 ]; then
    printf "Largest file:\n"
    return 0
  fi

  # The extension is taken from the file name only (%f), never from the whole
  # path, so dots in directory names are not mistaken for one. Files with no
  # extension are grouped under "(none)"; the leading dot of a hidden file is
  # not treated as an extension separator.
  find . -type f "${hidden_filter[@]}" -printf '%f\n' |
    awk '
      {
        name = $0
        sub(/^\.+/, "", name)
        dot = match(name, /\.[^.]+$/)
        ext = dot ? substr(name, dot + 1) : "(none)"
        count[ext]++
      }
      END { for (ext in count) printf "%d\t%s\n", count[ext], ext }
    ' |
    LC_ALL=C sort -t $'\t' -k1,1nr -k2,2 |
    head -n 5 |
    awk -F '\t' '{ print NR".", $2 " - " $1 " files" }'

  printf '%s%s - %s MB\n' "$largest_label" "$lfile" "$lsize_mb"
}

# Require the directory argument before asking anything.
if [[ -z "${1:-}" ]]; then
  echo "Directory not entered"
  exit 1
fi

echo "Do you want to include hidden files in directory analysis (yes/no)"
# -r keeps backslashes in the typed answer literal.
read -r option
# Quoted so that a path containing spaces stays one argument and an empty
# answer is still passed as the second argument.
directory_analyzer "$1" "$option"

Output
------
Do you want to include hidden files in directory analysis (yes/no)
yes
Directory Analysis for /home/devinder/Desktop/:
Total number of files: 2000
Total number of directories: 597
Total size of all files: 1057.9 MB
Top 5 file types:
1. pdf - 468 files
2. txt - 452 files
3. png - 246 files
4. jpg - 121 files
5. jpeg - 120 files
Largest file:./DEVINDER/test/dummy/dummy.pst - 263.5 MB