FILE HANDLING
from PadhAI Class
padhai_fileHandling

File Handling

In [1]:
import numpy as np
In [2]:
# First attempt: with default arguments loadtxt converts every field to float,
# so the text field 'MERCURY' in the file makes it raise ValueError.
planets_small = np.loadtxt("planets_small.txt")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-6f92f0fddb44> in <module>()
----> 1 planets_small = np.loadtxt("planets_small.txt")

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin, encoding, max_rows, like)
   1146         # converting the data
   1147         X = None
-> 1148         for x in read_data(_loadtxt_chunksize):
   1149             if X is None:
   1150                 X = np.array(x, dtype)

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in read_data(chunk_size)
    997 
    998             # Convert each value according to its column and store
--> 999             items = [conv(val) for (conv, val) in zip(converters, vals)]
   1000 
   1001             # Then pack it according to the dtype's nesting

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in <listcomp>(.0)
    997 
    998             # Convert each value according to its column and store
--> 999             items = [conv(val) for (conv, val) in zip(converters, vals)]
   1000 
   1001             # Then pack it according to the dtype's nesting

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in floatconv(x)
    734         if '0x' in x:
    735             return float.fromhex(x)
--> 736         return float(x)
    737 
    738     typ = dtype.type

ValueError: could not convert string to float: 'MERCURY'
In [5]:
# Second attempt: skiprows=1 skips the first line, but the load still fails on
# the text field 'Mass' (presumably a row label in the first column).
planets_small = np.loadtxt("planets_small.txt", skiprows=1)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-5-0ff3b4620290> in <module>()
----> 1 planets_small = np.loadtxt("planets_small.txt", skiprows=1)

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin, encoding, max_rows, like)
   1146         # converting the data
   1147         X = None
-> 1148         for x in read_data(_loadtxt_chunksize):
   1149             if X is None:
   1150                 X = np.array(x, dtype)

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in read_data(chunk_size)
    997 
    998             # Convert each value according to its column and store
--> 999             items = [conv(val) for (conv, val) in zip(converters, vals)]
   1000 
   1001             # Then pack it according to the dtype's nesting

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in <listcomp>(.0)
    997 
    998             # Convert each value according to its column and store
--> 999             items = [conv(val) for (conv, val) in zip(converters, vals)]
   1000 
   1001             # Then pack it according to the dtype's nesting

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in floatconv(x)
    734         if '0x' in x:
    735             return float.fromhex(x)
--> 736         return float(x)
    737 
    738     typ = dtype.type

ValueError: could not convert string to float: 'Mass'
In [8]:
# Skip the first line and column 0 (the text fields that broke the earlier
# attempts) and keep only columns 1..9, which parse as floats.
planets_small = np.loadtxt(
    "planets_small.txt",
    skiprows=1,
    usecols=tuple(range(1, 10)),
)
In [9]:
planets_small
Out[9]:
array([[3.3000e-01, 4.8700e+00, 5.9700e+00, 6.4200e-01, 1.8980e+03,
        5.6800e+02, 8.6800e+01, 1.0200e+02, 1.4600e-02],
       [5.7900e+01, 1.0820e+02, 1.4960e+02, 2.2790e+02, 7.7860e+02,
        1.4335e+03, 2.8725e+03, 4.4951e+03, 5.9064e+03],
       [4.2226e+03, 2.8020e+03, 2.4000e+01, 2.4700e+01, 9.9000e+00,
        1.0700e+01, 1.7200e+01, 1.6100e+01, 1.5330e+02]])
In [10]:
planets_small.ndim
Out[10]:
2
In [11]:
planets_small.shape
Out[11]:
(3, 9)
In [12]:
# Same options as for planets_small, but the full file has a non-numeric
# 'Unknown' entry inside the selected columns, so loadtxt raises ValueError.
planets = np.loadtxt("planets.txt", skiprows = 1,
                     usecols = (1,2,3,4,5,6,7,8,9))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-d084510916e4> in <module>()
      1 planets = np.loadtxt("planets.txt", skiprows = 1,
----> 2                      usecols = (1,2,3,4,5,6,7,8,9))

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin, encoding, max_rows, like)
   1146         # converting the data
   1147         X = None
-> 1148         for x in read_data(_loadtxt_chunksize):
   1149             if X is None:
   1150                 X = np.array(x, dtype)

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in read_data(chunk_size)
    997 
    998             # Convert each value according to its column and store
--> 999             items = [conv(val) for (conv, val) in zip(converters, vals)]
   1000 
   1001             # Then pack it according to the dtype's nesting

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in <listcomp>(.0)
    997 
    998             # Convert each value according to its column and store
--> 999             items = [conv(val) for (conv, val) in zip(converters, vals)]
   1000 
   1001             # Then pack it according to the dtype's nesting

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in floatconv(x)
    734         if '0x' in x:
    735             return float.fromhex(x)
--> 736         return float(x)
    737 
    738     typ = dtype.type

ValueError: could not convert string to float: 'Unknown'
In [13]:
# genfromtxt tolerates fields it cannot parse (they come back as nan), so the
# 'Unknown' entries that stopped loadtxt no longer abort the load.
planets = np.genfromtxt(
    "planets.txt",
    skip_header=1,
    usecols=tuple(range(1, 10)),
)
In [14]:
planets  # genfromtxt did not raise on the "Unknown" entries that broke loadtxt;
# those entries appear as nan in the array instead.
Out[14]:
array([[ 3.30000e-01,  4.87000e+00,  5.97000e+00,  7.30000e-02,
         6.42000e-01,  1.89800e+03,  5.68000e+02,  8.68000e+01,
         1.02000e+02],
       [ 4.87900e+03,  1.21040e+04,  1.27560e+04,  3.47500e+03,
         6.79200e+03,  1.42984e+05,  1.20536e+05,  5.11180e+04,
         4.95280e+04],
       [ 5.42700e+03,  5.24300e+03,  5.51400e+03,  3.34000e+03,
         3.93300e+03,  1.32600e+03,  6.87000e+02,  1.27100e+03,
         1.63800e+03],
       [ 3.70000e+00,  8.90000e+00,  9.80000e+00,  1.60000e+00,
         3.70000e+00,  2.31000e+01,  9.00000e+00,  8.70000e+00,
         1.10000e+01],
       [ 4.30000e+00,  1.04000e+01,  1.12000e+01,  2.40000e+00,
         5.00000e+00,  5.95000e+01,  3.55000e+01,  2.13000e+01,
         2.35000e+01],
       [ 1.40760e+03, -5.83250e+03,  2.39000e+01,  6.55700e+02,
         2.46000e+01,  9.90000e+00,  1.07000e+01, -1.72000e+01,
         1.61000e+01],
       [ 4.22260e+03,  2.80200e+03,  2.40000e+01,  7.08700e+02,
         2.47000e+01,  9.90000e+00,  1.07000e+01,  1.72000e+01,
         1.61000e+01],
       [ 5.79000e+01,  1.08200e+02,  1.49600e+02,  3.84000e-01,
         2.27900e+02,  7.78600e+02,  1.43350e+03,  2.87250e+03,
         4.49510e+03],
       [ 4.60000e+01,  1.07500e+02,  1.47100e+02,  3.63000e-01,
         2.06600e+02,  7.40500e+02,  1.35260e+03,  2.74130e+03,
         4.44450e+03],
       [ 6.98000e+01,  1.08900e+02,  1.52100e+02,  4.06000e-01,
         2.49200e+02,  8.16600e+02,  1.51450e+03,  3.00360e+03,
         4.54570e+03],
       [ 8.80000e+01,  2.24700e+02,  3.65200e+02,  2.73000e+01,
         6.87000e+02,  4.33100e+03,  1.07470e+04,  3.05890e+04,
         5.98000e+04],
       [ 4.74000e+01,  3.50000e+01,  2.98000e+01,  1.00000e+00,
         2.41000e+01,  1.31000e+01,  9.70000e+00,  6.80000e+00,
         5.40000e+00],
       [ 7.00000e+00,  3.40000e+00,  0.00000e+00,  5.10000e+00,
         1.90000e+00,  1.30000e+00,  2.50000e+00,  8.00000e-01,
         1.80000e+00],
       [ 2.05000e-01,  7.00000e-03,  1.70000e-02,  5.50000e-02,
         9.40000e-02,  4.90000e-02,  5.70000e-02,  4.60000e-02,
         1.10000e-02],
       [ 3.40000e-02,  1.77400e+02,  2.34000e+01,  6.70000e+00,
         2.52000e+01,  3.10000e+00,  2.67000e+01,  9.78000e+01,
         2.83000e+01],
       [ 1.67000e+02,  4.64000e+02,  1.50000e+01, -2.00000e+01,
        -6.50000e+01, -1.10000e+02, -1.40000e+02, -1.95000e+02,
        -2.00000e+02],
       [ 0.00000e+00,  9.20000e+01,  1.00000e+00,  0.00000e+00,
         1.00000e-02,          nan,          nan,          nan,
                 nan],
       [ 0.00000e+00,  0.00000e+00,  1.00000e+00,  0.00000e+00,
         2.00000e+00,  7.90000e+01,  8.20000e+01,  2.70000e+01,
         1.40000e+01],
       [ 0.00000e+00,  0.00000e+00,  0.00000e+00,  0.00000e+00,
         0.00000e+00,  1.00000e+00,  1.00000e+00,  1.00000e+00,
         1.00000e+00],
       [ 1.00000e+00,  0.00000e+00,  1.00000e+00,  0.00000e+00,
         0.00000e+00,  1.00000e+00,  1.00000e+00,  1.00000e+00,
         1.00000e+00]])
In [15]:
planets.shape
Out[15]:
(20, 9)
In [17]:
# ndarray has no .isnan() method, so this raises AttributeError;
# the NaN test is the module-level function np.isnan(planets).
planets.isnan()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-0eab59a6017c> in <module>()
----> 1 planets.isnan()

AttributeError: 'numpy.ndarray' object has no attribute 'isnan'
In [19]:
np.isnan(planets)  # element-wise boolean mask: True wherever the entry is nan, False elsewhere
Out[19]:
array([[False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False,  True,  True,  True,  True],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False]])

Replace the NaN entries with a chosen numeric value (here -1) using np.nan_to_num.

In [21]:
planets_new = np.nan_to_num(planets, nan=-1)
In [22]:
planets_new
Out[22]:
array([[ 3.30000e-01,  4.87000e+00,  5.97000e+00,  7.30000e-02,
         6.42000e-01,  1.89800e+03,  5.68000e+02,  8.68000e+01,
         1.02000e+02],
       [ 4.87900e+03,  1.21040e+04,  1.27560e+04,  3.47500e+03,
         6.79200e+03,  1.42984e+05,  1.20536e+05,  5.11180e+04,
         4.95280e+04],
       [ 5.42700e+03,  5.24300e+03,  5.51400e+03,  3.34000e+03,
         3.93300e+03,  1.32600e+03,  6.87000e+02,  1.27100e+03,
         1.63800e+03],
       [ 3.70000e+00,  8.90000e+00,  9.80000e+00,  1.60000e+00,
         3.70000e+00,  2.31000e+01,  9.00000e+00,  8.70000e+00,
         1.10000e+01],
       [ 4.30000e+00,  1.04000e+01,  1.12000e+01,  2.40000e+00,
         5.00000e+00,  5.95000e+01,  3.55000e+01,  2.13000e+01,
         2.35000e+01],
       [ 1.40760e+03, -5.83250e+03,  2.39000e+01,  6.55700e+02,
         2.46000e+01,  9.90000e+00,  1.07000e+01, -1.72000e+01,
         1.61000e+01],
       [ 4.22260e+03,  2.80200e+03,  2.40000e+01,  7.08700e+02,
         2.47000e+01,  9.90000e+00,  1.07000e+01,  1.72000e+01,
         1.61000e+01],
       [ 5.79000e+01,  1.08200e+02,  1.49600e+02,  3.84000e-01,
         2.27900e+02,  7.78600e+02,  1.43350e+03,  2.87250e+03,
         4.49510e+03],
       [ 4.60000e+01,  1.07500e+02,  1.47100e+02,  3.63000e-01,
         2.06600e+02,  7.40500e+02,  1.35260e+03,  2.74130e+03,
         4.44450e+03],
       [ 6.98000e+01,  1.08900e+02,  1.52100e+02,  4.06000e-01,
         2.49200e+02,  8.16600e+02,  1.51450e+03,  3.00360e+03,
         4.54570e+03],
       [ 8.80000e+01,  2.24700e+02,  3.65200e+02,  2.73000e+01,
         6.87000e+02,  4.33100e+03,  1.07470e+04,  3.05890e+04,
         5.98000e+04],
       [ 4.74000e+01,  3.50000e+01,  2.98000e+01,  1.00000e+00,
         2.41000e+01,  1.31000e+01,  9.70000e+00,  6.80000e+00,
         5.40000e+00],
       [ 7.00000e+00,  3.40000e+00,  0.00000e+00,  5.10000e+00,
         1.90000e+00,  1.30000e+00,  2.50000e+00,  8.00000e-01,
         1.80000e+00],
       [ 2.05000e-01,  7.00000e-03,  1.70000e-02,  5.50000e-02,
         9.40000e-02,  4.90000e-02,  5.70000e-02,  4.60000e-02,
         1.10000e-02],
       [ 3.40000e-02,  1.77400e+02,  2.34000e+01,  6.70000e+00,
         2.52000e+01,  3.10000e+00,  2.67000e+01,  9.78000e+01,
         2.83000e+01],
       [ 1.67000e+02,  4.64000e+02,  1.50000e+01, -2.00000e+01,
        -6.50000e+01, -1.10000e+02, -1.40000e+02, -1.95000e+02,
        -2.00000e+02],
       [ 0.00000e+00,  9.20000e+01,  1.00000e+00,  0.00000e+00,
         1.00000e-02, -1.00000e+00, -1.00000e+00, -1.00000e+00,
        -1.00000e+00],
       [ 0.00000e+00,  0.00000e+00,  1.00000e+00,  0.00000e+00,
         2.00000e+00,  7.90000e+01,  8.20000e+01,  2.70000e+01,
         1.40000e+01],
       [ 0.00000e+00,  0.00000e+00,  0.00000e+00,  0.00000e+00,
         0.00000e+00,  1.00000e+00,  1.00000e+00,  1.00000e+00,
         1.00000e+00],
       [ 1.00000e+00,  0.00000e+00,  1.00000e+00,  0.00000e+00,
         0.00000e+00,  1.00000e+00,  1.00000e+00,  1.00000e+00,
         1.00000e+00]])

Save the array to a file in a human-readable (comma-delimited text) format

In [23]:
np.savetxt('planets_new.txt', planets_new, delimiter=",")
  • Save the array in NumPy's own binary format, which is more efficient and takes less disk space than text
  • The file is saved in .npy format (np.save appends the .npy extension to the given name)
In [25]:
np.save("planets_new", planets_new)

List the files in the current working directory with ls; the leading ! runs a native Unix shell command from the notebook.

In [26]:
!ls
cric_data.tsv	 planets_new.txt    planets.txt
planets_new.npy  planets_small.txt  sample_data
In [27]:
!ls -l
total 28
-rw-r--r-- 1 root root 2952 Mar  8 19:19 cric_data.tsv
-rw-r--r-- 1 root root 1568 Mar  8 19:41 planets_new.npy
-rw-r--r-- 1 root root 4512 Mar  8 19:39 planets_new.txt
-rw-r--r-- 1 root root  254 Mar  8 19:20 planets_small.txt
-rw-r--r-- 1 root root 1436 Mar  8 19:20 planets.txt
drwxr-xr-x 1 root root 4096 Mar  7 14:45 sample_data
In [28]:
!ls -lh
total 28K
-rw-r--r-- 1 root root 2.9K Mar  8 19:19 cric_data.tsv
-rw-r--r-- 1 root root 1.6K Mar  8 19:41 planets_new.npy
-rw-r--r-- 1 root root 4.5K Mar  8 19:39 planets_new.txt
-rw-r--r-- 1 root root  254 Mar  8 19:20 planets_small.txt
-rw-r--r-- 1 root root 1.5K Mar  8 19:20 planets.txt
drwxr-xr-x 1 root root 4.0K Mar  7 14:45 sample_data

Store multiple arrays in a single file (an .npz archive) with np.savez

In [29]:
# Seed a Generator so the demo arrays are identical on every Restart & Run All;
# the unseeded np.random.rand calls produced different data on each run.
rng = np.random.default_rng(seed=42)
# Three arrays of different shapes, filled with uniform samples from [0, 1).
arr1 = rng.random((1000, 10))
arr2 = rng.random((2000, 20))
arr3 = rng.random((10, 10000))
In [30]:
# Arrays passed positionally are stored under auto-generated keys
# (arr_0, arr_1, arr_2); savez writes them to many_arrs.npz.
np.savez("many_arrs", arr1, arr2, arr3)
In [32]:
!ls -lh
total 1.2M
-rw-r--r-- 1 root root 2.9K Mar  8 19:19 cric_data.tsv
-rw-r--r-- 1 root root 1.2M Mar  8 19:47 many_arrs.npz
-rw-r--r-- 1 root root 1.6K Mar  8 19:41 planets_new.npy
-rw-r--r-- 1 root root 4.5K Mar  8 19:39 planets_new.txt
-rw-r--r-- 1 root root  254 Mar  8 19:20 planets_small.txt
-rw-r--r-- 1 root root 1.5K Mar  8 19:20 planets.txt
drwxr-xr-x 1 root root 4.0K Mar  7 14:45 sample_data

loading files

In [35]:
arrs = np.load("many_arrs.npz")
In [36]:
arrs.files
Out[36]:
['arr_0', 'arr_1', 'arr_2']
In [40]:
arrs['arr_1'].shape
Out[40]:
(2000, 20)
In [42]:
# Compressed variant of savez; for this random data the saving is small
# (1.1M compressed vs 1.2M uncompressed in the listing below).
np.savez_compressed("many_arrs_compressed", arr1, arr2, arr3)
In [43]:
!ls -lh
total 2.3M
-rw-r--r-- 1 root root 2.9K Mar  8 19:19 cric_data.tsv
-rw-r--r-- 1 root root 1.1M Mar  8 19:52 many_arrs_compressed.npz
-rw-r--r-- 1 root root 1.2M Mar  8 19:47 many_arrs.npz
-rw-r--r-- 1 root root 1.6K Mar  8 19:41 planets_new.npy
-rw-r--r-- 1 root root 4.5K Mar  8 19:39 planets_new.txt
-rw-r--r-- 1 root root  254 Mar  8 19:20 planets_small.txt
-rw-r--r-- 1 root root 1.5K Mar  8 19:20 planets.txt
drwxr-xr-x 1 root root 4.0K Mar  7 14:45 sample_data
In [45]:
# A 10000 x 10000 array of zeros: highly repetitive data, used to contrast
# compressed and uncompressed archive sizes (760K vs 763M in the listing below).
npZeros = np.zeros((10000, 10000))
np.savez_compressed("npZeros_compressed", npZeros)
In [46]:
np.savez("npZeros", npZeros)
In [47]:
# Typo: the space after '-' makes ls treat '-' and 'lh' as two file names
# (hence the "cannot access" errors); the intended command is `!ls -lh`.
ls - lh
ls: cannot access '-': No such file or directory
ls: cannot access 'lh': No such file or directory
In [48]:
!ls -lh
total 766M
-rw-r--r-- 1 root root 2.9K Mar  8 19:19 cric_data.tsv
-rw-r--r-- 1 root root 1.1M Mar  8 19:52 many_arrs_compressed.npz
-rw-r--r-- 1 root root 1.2M Mar  8 19:47 many_arrs.npz
-rw-r--r-- 1 root root 760K Mar  8 19:54 npZeros_compressed.npz
-rw-r--r-- 1 root root 763M Mar  8 19:54 npZeros.npz
-rw-r--r-- 1 root root 1.6K Mar  8 19:41 planets_new.npy
-rw-r--r-- 1 root root 4.5K Mar  8 19:39 planets_new.txt
-rw-r--r-- 1 root root  254 Mar  8 19:20 planets_small.txt
-rw-r--r-- 1 root root 1.5K Mar  8 19:20 planets.txt
drwxr-xr-x 1 root root 4.0K Mar  7 14:45 sample_data
In [ ]: