Decrypt an AES-128-CBC encrypted object in chunks

I have an encrypted object in Minio, encrypted using the AES 128-bit CBC algorithm.

The object is quite large (~50 MB), so instead of loading it into memory completely (which may cause an out-of-memory exception), I am retrieving it in chunks of 1 MB. I need to decrypt it before use.

Is it possible to decrypt the object this way (1 MB at a time, even though the whole object was encrypted in one go)? If yes, how can I do it? I have tried decrypting 16-byte chunks, which produces the following errors:

javax.crypto.BadPaddingException: Given final block not properly padded

javax.crypto.IllegalBlockSizeException: Input length must be multiple of 16 when decrypting with padded cipher
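
For illustration, CBC can be decrypted a chunk at a time as long as every chunk passes through the same cipher instance and the padding is stripped only once, after the final chunk; treating each 16-byte piece as a complete padded message is what raises the two exceptions above. Below is a minimal sketch of that pattern in Python rather than Java (using the cryptography package; read_chunks() and output are hypothetical stand-ins for the Minio download and the destination):

    # Sketch: decrypt a CBC stream chunk by chunk. `key`, `iv`, `read_chunks`
    # and `output` are placeholders, not part of the original question.
    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
    from cryptography.hazmat.primitives import padding

    decryptor = Cipher(algorithms.AES(key), modes.CBC(iv)).decryptor()
    unpadder = padding.PKCS7(128).unpadder()  # strip PKCS#7 only once, at the end

    for chunk in read_chunks(chunk_size=1024 * 1024):   # 1 MB pieces
        output.write(unpadder.update(decryptor.update(chunk)))

    output.write(unpadder.update(decryptor.finalize()) + unpadder.finalize())

The same structure applies in Java with a single Cipher used in update() mode per chunk (or a CipherInputStream wrapped around the object stream), with doFinal() called only after the last chunk.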

Converts audio files to fixed-size chunks and the chunks to spectrogram images

This code takes audio files (.wav or .WAV) as input and divides them into fixed-size samples (chunkSize, in seconds). These chunks are then converted to spectrogram images after applying PCEN (Per-Channel Energy Normalization) and wavelet denoising, using librosa.

My concern now is how to improve performance and speed up this whole conversion process. The code maintains the parent directory structure, which is essential for my project.

I’ll be running this on Google Colab over roughly 50k audio files of varying durations, from 5 to 25 minutes. Eventually, though, the script needs to run well and fast on my local computer too. Is there a way to use parallel processing here? Currently, while running this on Google Colab, the stage that converts the audio chunks to images and saves them eats up all the RAM and stops after saving around 1k images.

#!/usr/bin/env python3
# coding=utf-8

# Imports

import os
import argparse
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np

from skimage.restoration import (denoise_wavelet, estimate_sigma)
from pydub import AudioSegment


plt.rcParams.update({'figure.max_open_warning': 0})


def padding(data, input_length):
    '''Padding of samples to make them of same length'''
    if len(data) > input_length:
        max_offset = len(data) - input_length
        offset = np.random.randint(max_offset)
        data = data[offset:(input_length + offset)]
    else:
        if input_length > len(data):
            max_offset = input_length - len(data)
            offset = np.random.randint(max_offset)
        else:
            offset = 0
        data = np.pad(data, (offset, input_length - len(data) - offset), "constant")
    return data


def audio_norm(data):
    '''Normalization of audio'''
    max_data = np.max(data)
    min_data = np.min(data)
    data = (data - min_data) / (max_data - min_data + 1e-6)
    return data - 0.5


def mfcc(data, sampling_rate, n_mfcc):
    '''Compute mel-scaled feature using librosa'''
    data = librosa.feature.mfcc(data, sr=sampling_rate, n_mfcc=n_mfcc)
    # data = np.expand_dims(data, axis=-1)
    return data


def pcen(data, sampling_rate):
    '''Compute Per-Channel Energy Normalization (PCEN)'''
    S = librosa.feature.melspectrogram(
        data, sr=sampling_rate, power=1)  # Compute mel-scaled spectrogram
    # Convert an amplitude spectrogram to dB-scaled spectrogram
    log_S = librosa.amplitude_to_db(S, ref=np.max)
    pcen_S = librosa.core.pcen(S)
    return pcen_S


def wavelet_denoising(data):
    '''
    Wavelet Denoising using scikit-image
    NOTE: Wavelet denoising is an effective method for SNR improvement in environments with
          wide range of noise types competing for the same subspace.
    '''
    sigma_est = estimate_sigma(data, multichannel=True, average_sigmas=True)
    im_bayes = denoise_wavelet(data, multichannel=False, convert2ycbcr=True, method='BayesShrink',
                               mode='soft')
    im_visushrink = denoise_wavelet(data, multichannel=False, convert2ycbcr=True, method='VisuShrink',
                                    mode='soft')

    # VisuShrink is designed to eliminate noise with high probability, but this
    # results in a visually over-smooth appearance. Here, we specify a reduction
    # in the threshold by factors of 2 and 4.
    im_visushrink2 = denoise_wavelet(data, multichannel=False, convert2ycbcr=True, method='VisuShrink',
                                     mode='soft', sigma=sigma_est / 2)
    im_visushrink4 = denoise_wavelet(data, multichannel=False, convert2ycbcr=True, method='VisuShrink',
                                     mode='soft', sigma=sigma_est / 4)
    return im_bayes


def set_rate(audio, rate):
    return audio.set_frame_rate(rate)


def make_chunks(filename, chunk_size, sampling_rate, target_location):
    '''Divide the audio file into chunk_size samples'''
    f = AudioSegment.from_wav(filename)

    if f.frame_rate != sampling_rate:
        f = set_rate(f, sampling_rate)
    j = 0

    # Make folder to maintain same directory structure
    if not os.path.exists(target_location):
        os.makedirs(target_location)

    # Change to current folder
    os.chdir(target_location)

    # Get file name
    f_name = os.path.basename(filename)

    while len(f[:]) >= chunk_size * 1000:
        chunk = f[:chunk_size * 1000]
        chunk.export(f_name[:-4] + "_" + str(j) + ".wav", format="wav")
        print("File stored at " + f_name[:-4] + "_" + str(j) + ".wav")
        f = f[chunk_size * 1000:]
        j += 1

    if 0 < len(f[:]) and len(f[:]) < chunk_size * 1000:
        silent = AudioSegment.silent(duration=chunk_size * 1000)
        paddedData = silent.overlay(f, position=0, times=1)
        paddedData.export(f_name[:-4] + "_" + str(j) + ".wav", format="wav")
        print("File stored at " + f_name[:-4] + "_" + str(j) + ".wav")


def main(args):
    sampling_rate = args.resampling
    audio_duration = args.dur
    use_mfcc = args.mfcc
    n_mfcc = args.nmfcc
    file_path = args.classpath
    chunkSize = args.chunks

    audio_length = sampling_rate * audio_duration
    def preprocessing_fn(x): return x
    input_length = audio_length

    no_of_files = len(os.listdir('.'))

    # Traverse all files inside each sub-folder and make chunks of audio file
    for dirs, subdirs, files in os.walk(file_path):
        for file in files:
            if file.endswith(('.wav', '.WAV')):
                print(f"Making chunks of size {chunkSize}s of file: {file}")

                # Make chunks of data of chunk_size
                input_file = f"{dirs}" + str("/") + file

                # Get Current working directory to make parent folders
                w_d = os.getcwd()
                output_path = "PreProcessed_audi/" + str(dirs) + str("/")

                '''
                CouldntDecodeError: Decoding failed. ffmpeg returned error
                code: 1 in file ._20180605_0645_AD8.wav 2018, so catching exception
                '''
                try:
                    make_chunks(
                        input_file,
                        chunkSize,
                        sampling_rate,
                        output_path)
                except Exception as e:
                    print(f"Exception: {e}")
                    pass

                # Change to parent directory to make parent sub-folders
                os.chdir(w_d)

    # file_path now directs to the path with all the audio chunks
    file_path = "PreProcessed_audio/data"

    print(f"Starting to load {no_of_files} data files in the directory")
    print(f"All files will be resampled to {sampling_rate}Hz")

    for dirs, subdirs, files in os.walk(file_path):
        for i, file in enumerate(files):
            if file.endswith(('.wav', '.WAV')):
                print(f"Pre-Processing file: {file}")
                data, sr = librosa.core.load(
                    f"{dirs}" + str("/") + file, sr=sampling_rate, res_type='kaiser_fast')
                tar_path = "PreProcessed_image/" + str(dirs) + str("/")

                # There is no need to apply padding since all samples are of same length
                # apply padding
                # padded_data = padding(data, input_length)

                # TODO: mismatch of shape
                # if use_mfcc:
                #     mfcc_data = mfcc(padded_data, sampling_rate, n_mfcc)
                # else:
                #     mfcc_data = preprocessing_fn(padded_data)[:, np.newaxis]

                # apply Per-Channel Energy Normalization
                pcen_S = pcen(data, sr)

                # apply Wavelet Denoising
                denoised_data = wavelet_denoising(pcen_S)

                # Get Current working directory to make parent folders
                w_d = os.getcwd()

                # Make folders (with sub-folders) to maintain same directory
                # structure
                if not os.path.exists(tar_path):
                    os.makedirs(tar_path)

                # Change to current folder
                os.chdir(tar_path)

                # Get file name
                f_name = os.path.basename(file)

                # Plotting and Saving
                fig, ax = plt.subplots()

                # Add this line to show plots else ignore warnings
                # plt.ion()

                ax.imshow(denoised_data)
                ax.get_xaxis().set_visible(False)
                ax.get_yaxis().set_visible(False)
                fig.set_size_inches(10, 10)
                fig.savefig(
                    f"{f_name[:-4]}_{i}.png",
                    dpi=80,
                    bbox_inches="tight",
                    quality=95,
                    pad_inches=0.0)

                fig.canvas.draw()
                fig.canvas.flush_events()

                # Change to parent directory to make parent sub-folders
                os.chdir(w_d)


if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description="Pre-Process the audio files and save as spectrogram images")
    parser.add_argument(
        '-c',
        '--classpath',
        type=str,
        help='directory with list of classes',
        required=True)
    parser.add_argument(
        '-r',
        '--resampling',
        type=int,
        default=44100,
        help='choose sampling rate')
    parser.add_argument(
        '-d',
        "--dur",
        type=int,
        default=2,
        help='Max duration (in seconds) of each clip')
    parser.add_argument(
        '-s',
        "--chunks",
        type=int,
        default=5,
        help='Chunk Size for each sample to be divided to')
    parser.add_argument(
        '-m',
        "--mfcc",
        type=bool,
        default=False,
        help='apply mfcc')
    parser.add_argument(
        '-n',
        "--nmfcc",
        type=int,
        default=20,
        help='Number of mfcc to return')

    args = parser.parse_args()

    main(args)
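
One direction for the speed and RAM questions above, sketched only: parallelize the second loop across processes and close each figure after saving. This assumes the per-file work is refactored into a standalone function (called process_one_file here, a hypothetical name) that takes absolute paths and avoids os.chdir, so workers do not fight over the working directory:

    # Sketch: parallelize the chunk-to-image stage with a process pool.
    # `process_one_file(path, sampling_rate)` is a hypothetical refactor of the
    # body of the second os.walk loop.
    import multiprocessing as mp

    def run_parallel(wav_paths, sampling_rate, n_workers=4):
        jobs = [(path, sampling_rate) for path in wav_paths]
        with mp.Pool(processes=n_workers) as pool:
            pool.starmap(process_one_file, jobs)

    # Inside process_one_file, release each figure after saving so memory does
    # not grow with the number of images:
    #     fig.savefig(out_path, dpi=80, bbox_inches="tight", pad_inches=0.0)
    #     plt.close(fig)

Closing the figures may already account for much of the Colab behaviour: without plt.close(fig), every saved image stays registered in Matplotlib's figure list (the script even suppresses the max_open_warning), so memory grows with the number of images.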

How to inflate deflated chunks from .mca files in C#?

Using this site as a reference, https://minecraft.gamepedia.com/Region_file_format, I am able to parse the first 8 KB of a .mca file and extract the chunk/section index. I then try to decompress the first block, located at 4096*2+5 = 8197 bytes. It has a 4-byte big-endian integer indicating the length is 645 bytes, including the 1-byte compression type that indicates zlib compression, so 644 bytes of compressed data follow the compression-type byte. This data starts with the zlib magic bytes 78 9c, followed by 3 bits indicating a final block with a dynamic compression type according to RFC 1951. But my first problem is that when I look at the hex dump of the .mca file, there appear to be 1,230 bytes of data before the empty zero padding. My second problem is that when I pass this compressed data to Unity's C# System.IO.Compression.DeflateStream, it throws IOException: Corrupted data ReadInternal from the CheckResult method. So I assume that it is not getting a proper checksum.

Anyone know what I am doing wrong?
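
For reference, 78 9c marks a full zlib stream (a 2-byte header plus an Adler-32 trailer), whereas System.IO.Compression.DeflateStream expects raw deflate data, so the usual options are to skip the first two bytes (and ignore the trailer) or to use a zlib-aware inflater. As a cross-check of the layout described above, here is a small Python sketch that reads and inflates the first chunk (the file name is illustrative):

    # Sketch: inflate the first chunk of a region file with zlib, to confirm
    # the layout described in the question.
    import struct, zlib

    with open("r.0.0.mca", "rb") as f:              # hypothetical region file
        f.seek(4096 * 2)                            # first chunk's sector offset
        length, = struct.unpack(">i", f.read(4))    # big-endian length, incl. type byte
        compression_type = f.read(1)[0]             # 2 = zlib
        payload = f.read(length - 1)                # zlib header + deflate data + Adler-32

    nbt_data = zlib.decompress(payload)             # zlib handles header and checksum
    print(compression_type, len(payload), len(nbt_data))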

SafeBlockPopulator to ensure neighboring chunks are generated before population

Bukkit/CraftBukkit/Spigot are Minecraft server extenders that allow for creating and using plugins. A particular type of plugin is a generator, which creates the game world. Generators operate chunk-by-chunk, where a chunk is a 16x16x256 area of blocks. First, the generator creates the terrain of the chunk. Then, the generator populates the chunk with extra “stuff.” A generator can have any number of populators.

The “problem” with populators is that they may attempt to populate outside the bounds of the current chunk, that is, in another chunk. If the other chunk does not yet exist, the generator will attempt to generate and populate that chunk, and so on, until the server crashes from locked threads waiting on all of the neighboring chunks.

My solution is to create a “SafeBlockPopulator” that doesn’t attempt population until it is certain that all neighboring chunks exist. It uses a SQLite database to keep track of all of the chunks it has seen, and only actually attempts to populate a given chunk once all neighboring chunks within a given radius exist in the database.

package com.michaelbmorris.generator;

import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Random;

import org.bukkit.Chunk;
import org.bukkit.World;
import org.bukkit.generator.BlockPopulator;

/**
 * Populates a chunk once all surrounding chunks within a radius are generated.
 */
public abstract class SafeBlockPopulator extends BlockPopulator {
    private static final HashSet<String> DatabaseUrls = new HashSet<String>();
    private static final int DEFAULT_RADIUS = 1;

    /*
     * Statuses
     */

    private static final int STATUS_GENERATED = 0;
    private static final int STATUS_POPULATED = 1;

    /*
     * SQL
     */

    private static final String CREATE_TABLE = "CREATE TABLE IF NOT EXISTS chunkCoordinate (x INTEGER NOT NULL, z INTEGER NOT NULL, status INTEGER NOT NULL, PRIMARY KEY (x, z));";
    private static final String DELETE_GENERATED_CHUNKS = "DELETE FROM chunkCoordinate WHERE status = " + STATUS_GENERATED + ";";
    private static final String GET_CHUNK = "SELECT * FROM chunkCoordinate WHERE x = ? AND z = ?;";
    private static final String GET_GENERATED_CHUNKS = "SELECT * FROM chunkCoordinate WHERE status = " + STATUS_GENERATED + ";";
    private static final String INSERT_CHUNK = "INSERT INTO chunkCoordinate (x, z, status) VALUES (?, ?, 0);";
    private static final String RESET_CHUNK = "UPDATE chunkCoordinate SET status = " + STATUS_GENERATED + " WHERE x = ? AND z = ?;";
    private static final String SET_CHUNK_POPULATED = "UPDATE chunkCoordinate SET status = " + STATUS_POPULATED + " WHERE x = ? AND z = ?;";

    private static ResultSet getGeneratedChunks(Connection connection) throws SQLException {
        PreparedStatement getGeneratedChunks = connection.prepareStatement(GET_GENERATED_CHUNKS);
        return getGeneratedChunks.executeQuery();
    }

    private static void insertOrResetChunk(int x, int z, Connection connection) throws SQLException {
        PreparedStatement getChunk = connection.prepareStatement(GET_CHUNK);
        getChunk.setInt(1, x);
        getChunk.setInt(2, z);
        ResultSet chunk = getChunk.executeQuery();

        if (!chunk.next()) {
            PreparedStatement insertChunk = connection.prepareStatement(INSERT_CHUNK);
            insertChunk.setInt(1, x);
            insertChunk.setInt(2, z);
            insertChunk.executeUpdate();
        } else {
            PreparedStatement resetChunk = connection.prepareStatement(RESET_CHUNK);
            resetChunk.setInt(1, x);
            resetChunk.setInt(2, z);
            resetChunk.executeUpdate();
        }
    }

    private final HashMap<String, Chunk> chunks;
    private final int radius;
    private final String databaseUrl;

    /**
     * Creates a SafeBlockPopulator with the default radius of 1.
     */
    protected SafeBlockPopulator(String databaseUrl, boolean isNew) {
        this(databaseUrl, isNew, DEFAULT_RADIUS);
    }

    /**
     * Creates a SafeBlockPopulator with a specified radius.
     */
    protected SafeBlockPopulator(String databaseUrl, boolean isNew, int radius) {
        if (databaseUrl == null || databaseUrl.isEmpty()) {
            throw new IllegalArgumentException("Inheriting block populator must supply a URL for the SQLite database.");
        }

        if (DatabaseUrls.contains(databaseUrl)) {
            throw new IllegalArgumentException("Each populator must have a unique database URL.");
        }

        if (radius < 1) {
            throw new IllegalArgumentException("The radius must be at least 1.");
        }

        DatabaseUrls.add(databaseUrl);
        this.radius = radius;
        this.databaseUrl = "jdbc:sqlite:" + databaseUrl;

        if (isNew) {
            File database = new File(databaseUrl);
            database.delete();
        }

        try (Connection connection = DriverManager.getConnection(this.databaseUrl)) {
            Statement statement = connection.createStatement();
            statement.execute(CREATE_TABLE);
            statement.executeUpdate(DELETE_GENERATED_CHUNKS);
        } catch (SQLException e) {
            System.out.println(e.getMessage());
        }

        chunks = new HashMap<String, Chunk>();
    }

    @Override
    public final void populate(World world, Random random, Chunk chunk) {
        int x = chunk.getX();
        int z = chunk.getZ();
        chunks.put(x + " " + z, chunk);

        try (Connection connection = DriverManager.getConnection(databaseUrl)) {
            insertOrResetChunk(x, z, connection);
            attemptPopulate(world, random, connection);
        } catch (SQLException e) {
            System.out.println(e.getMessage());
        }
    }

    private void attemptPopulate(World world, Random random, Connection connection) throws SQLException {
        ResultSet unpopulatedChunks = getGeneratedChunks(connection);
        PreparedStatement setChunkPopulated = connection.prepareStatement(SET_CHUNK_POPULATED);

        while (unpopulatedChunks.next()) {
            if (unpopulatedChunks.getInt("status") == STATUS_GENERATED) {
                int chunkX = unpopulatedChunks.getInt("x");
                int chunkZ = unpopulatedChunks.getInt("z");

                if (hasSurrounding(connection, chunkX, chunkZ)) {
                    Chunk chunk;
                    String key = chunkX + " " + chunkZ;

                    if (chunks.containsKey(key)) {
                        chunk = chunks.get(key);
                        chunks.remove(key);
                    } else {
                        chunk = world.getChunkAt(chunkX, chunkZ);
                    }

                    actuallyPopulate(world, random, chunk);
                    setChunkPopulated.setInt(1, unpopulatedChunks.getInt("x"));
                    setChunkPopulated.setInt(2, unpopulatedChunks.getInt("z"));
                    setChunkPopulated.executeUpdate();
                }
            }
        }
    }

    private boolean hasSurrounding(Connection connection, int x, int z) throws SQLException {
        PreparedStatement getChunk = connection.prepareStatement(GET_CHUNK);
        ResultSet resultSet;

        for (int i = 0 - radius; i <= radius; i++) {
            for (int j = 0 - radius; j <= radius; j++) {
                getChunk.setInt(1, x + i);
                getChunk.setInt(2, z + j);
                resultSet = getChunk.executeQuery();

                if (!resultSet.next()) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Actually populates this chunk once all surrounding chunks within the radius
     * are generated.
     */
    protected abstract void actuallyPopulate(World world, Random random, Chunk chunk);
}

This obviously causes somewhat reduced performance. The database interactions are costly compared to keeping track of the data in memory, but persistence is required for when the server restarts. Theoretically, it would be nice to load all the data from the database on startup and save it all back on shutdown, but there’s no guarantee that the server will be shut down correctly. I do have to keep the chunks themselves in memory because there’s another bug in the server that causes chunks to be “forgotten” if they are generated but not populated for a while. That’s why I delete all generated-but-not-populated chunks from the database on startup; the server consistently sends all of them through again.

Larger radii decrease performance as well, but it’s up to the author of the inheriting class to determine how much room their populator implementation needs.

Is there any way to optimize/reduce my database calls while still ensuring that all data is persisted immediately? In particular, I’m looking at the hasSurrounding() method, which makes (2 * radius + 1)^2 database calls. Are there any other improvements I could make?
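
One possibility, sketched under the assumption that only existence needs to be checked: because (x, z) is the primary key, the (2 * radius + 1)^2 point lookups in hasSurrounding() can collapse into a single bounded COUNT query over the same chunkCoordinate table. Shown in Python with sqlite3 purely for brevity; the SQL is the point, and the same statement works from JDBC:

    # Sketch: replace the per-cell lookups with one range COUNT against the
    # same chunkCoordinate(x, z, status) table.
    import sqlite3

    HAS_SURROUNDING = (
        "SELECT COUNT(*) FROM chunkCoordinate "
        "WHERE x BETWEEN ? AND ? AND z BETWEEN ? AND ?;"
    )

    def has_surrounding(conn: sqlite3.Connection, x: int, z: int, radius: int) -> bool:
        expected = (2 * radius + 1) ** 2          # every cell in the square must exist
        (count,) = conn.execute(
            HAS_SURROUNDING, (x - radius, x + radius, z - radius, z + radius)
        ).fetchone()
        return count == expected

Since the primary key guarantees at most one row per cell, the count equals the number of distinct cells present, so the check passes exactly when all of them exist.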

How to divide data into chunks, process every chunk, and save each chunk's result in a DataFrame

I have a DataFrame with 6000 rows. I want to divide this DataFrame into chunks of 500 data points and then process the chunks. Processing the first chunk gives a result with two columns, x and y; the same goes for the other chunks, so every chunk yields an x and y result. I want to save a final result after comparing the results of all chunks.

For example, compare the x and y results of all chunks and, at the end, save the result of the chunk whose y is greater than that of all other chunks. I tried this piece of code, but the problem is that I only get the x and y results of the last chunk.

    chunk_size = 500
    for start in range(500, df.shape[0], chunk_size):
        df_subset = df.iloc[start:start + chunk_size]
        #df1=df1.append(df_subset)
        x=df_subset['diff']
        fs=50.0
        #########Xfft########
        len_data=len(x)
        amp_x=2*abs(np.fft.fft(x-np.mean(x)))/len_data
        fren_x=fs*(np.array(range(0,len_data)))/len_data
        frequency=fren_x[0:(len(fren_x)//2)]
        amplitude=amp_x[0:(len(amp_x)//2)]
        dx=pd.DataFrame({'x':frequency,'y':amplitude})
        freq_x_max=dx.loc[dx['y'].idxmax()]
        freq_x_max=pd.DataFrame(freq_x_max)
        fft_result=freq_x_max.T
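
The reason only the last chunk survives is that fft_result is overwritten on every iteration. A minimal sketch of the accumulation step, assuming the per-chunk FFT code above is wrapped in a hypothetical process_chunk() function that returns a one-row DataFrame with columns x and y:

    # Sketch: accumulate one (x, y) row per chunk, then keep the chunk whose y
    # is largest. `process_chunk` stands in for the FFT code shown above.
    import pandas as pd

    chunk_size = 500
    results = []

    for start in range(0, df.shape[0], chunk_size):
        df_subset = df.iloc[start:start + chunk_size]
        results.append(process_chunk(df_subset))        # one-row DataFrame per chunk

    all_results = pd.concat(results, ignore_index=True)
    best = all_results.loc[all_results['y'].idxmax()]   # final result to save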

Trying to merge chunks to trigger better balancing after 50% of the data was deleted by the developers

Trying to merge chunks using the following command:

    db.adminCommand(
        {
            mergeChunks: "HTMLDumps.HTMLRepository",
            bounds: [ { "ShardMapId" : 2, "DomainId" : 62 },
                      { "ShardMapId" : 2, "DomainId" : 162 } ]
        }
    )

I get the following error when trying to run the above command to merge any of the available consecutive chunks on a shard:

    {
        "ok" : 0,
        "errmsg" : "Failed to commit chunk merge :: caused by :: DuplicateKey: chunk operation commit failed: version 32|6||5ba8d864bba4ff264edf0bd9 doesn't exist in namespace: HTMLDumps.HTMLRepository. Unable to save chunk ops. Command: { applyOps: [ { op: \"u\", b: false, ns: \"config.chunks\", o: { _id: \"HTMLDumps.HTMLRepository-ShardMapId_2.0DomainId_62.0\", ns: \"HTMLDumps.HTMLRepository\", min: { ShardMapId: 2.0, DomainId: 62.0 }, max: { ShardMapId: 2, DomainId: 162 }, shard: \"shard0000\", lastmod: Timestamp(32, 6), lastmodEpoch: ObjectId('5ba8d864bba4ff264edf0bd9') }, o2: { _id: \"HTMLDumps.HTMLRepository-ShardMapId_2.0DomainId_62.0\" } }, { op: \"d\", ns: \"config.chunks\", o: { _id: \"HTMLDumps.HTMLRepository-ShardMapId_2DomainId_109\" } } ], preCondition: [ { ns: \"config.chunks\", q: { query: { ns: \"HTMLDumps.HTMLRepository\", min: { ShardMapId: 2.0, DomainId: 62.0 }, max: { ShardMapId: 2, DomainId: 109 } }, orderby: { lastmod: -1 } }, res: { lastmodEpoch: ObjectId('5ba8d864bba4ff264edf0bd9'), shard: \"shard0000\" } }, { ns: \"config.chunks\", q: { query: { ns: \"HTMLDumps.HTMLRepository\", min: { ShardMapId: 2, DomainId: 109 }, max: { ShardMapId: 2, DomainId: 162 } }, orderby: { lastmod: -1 } }, res: { lastmodEpoch: ObjectId('5ba8d864bba4ff264edf0bd9'), shard: \"shard0000\" } } ], writeConcern: { w: 0, wtimeout: 0 } }. Result: { applied: 1, code: 11000, codeName: \"DuplicateKey\", errmsg: \"E11000 duplicate key error collection: config.chunks index: ns_1_min_1 dup key: { : \"HTMLDumps.HTMLRepository\", : { ShardMapId: 2.0, DomainId: 62.0 } }\", results: [ false ], ok: 0.0, operationTime: Timestamp(1554112692, 1), $gleStats: { lastOpTime: { ts: Timestamp(1554112692, 1), t: 13 }, electionId: ObjectId('7fffffff000000000000000d') }, $clusterTime: { clusterTime: Timestamp(1554112692, 1), signature: { hash: BinData(0, 0000000000000000000000000000000000000000), keyId: 0 } } } :: caused by :: E11000 duplicate key error collection: config.chunks index: ns_1_min_1 dup key: { : \"HTMLDumps.HTMLRepository\", : { ShardMapId: 2.0, DomainId: 62.0 } }",
        "code" : 11000,
        "codeName" : "DuplicateKey",
        "operationTime" : Timestamp(1554112687, 1),
        "$clusterTime" : {
            "clusterTime" : Timestamp(1554112687, 1),
            "signature" : {
                "hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
                "keyId" : NumberLong(0)
            }
        }
    }

This happens regardless of which chunks I select. My main reason for trying this is to achieve true data balancing, not just balanced chunk counts. Recently, developers deleted 90% of the data from these chunks, which moved the distribution from the earlier 60/40 state to 90/10. I hope that merging/removing empty chunks will bring the data balance back as close to 60/40 as possible.

How to bundle vendor chunks which are in the node_modules directory

Is there a way to include chunk files created by a dynamic import inside a dependency module as chunk files in the main application?

Here is what I am trying –

I have two npm modules, say M1, M2.

M2 dynamically imports M1 using the import() operator. While bundling M2, I am using webpackChunkName to create vendor~M1.js.

    M2.js

    ...

    import(/* webpackChunkName: "M1" */ 'M1/index')
    .then(){}

Then there is a React application A1 that statically imports M2.

    A1.js

    import index from 'M2'

    ...

While bundling A1, I am using splitChunks to create M2bundle.js for M2.

webpack.config.js looks like this –

    splitChunks: {
      cacheGroups: {
        default: false,
        commons: {
          test: /[\/]node_modules[\/]/,
          name: 'M2bundle',
          chunks: 'all',
          minChunks: 1,
        },
      },
    },

The output of this creates main.js for A1.js and M2bundle.js for the M2 module, but it does not include vendor~M1.js, which is in the node_modules/M2 directory.

Is there a way in the webpack config to achieve this?

Appreciate the help!

Python Pandas NLTK: Adding Frequency Counts or Importance Scoring to Part of Speech Chunks on Dataframe Text Column

I did NLTK part-of-speech tagging followed by chunking on one column (“train_text”) inside my Pandas DataFrame.

Below is my code, which ran successfully, along with sample output results.

def process_content():
    try:
        for i in train_text:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkGram = r"""Chunk: {<VB.?><NN.?>}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            for subtree in chunked.subtrees(filter = lambda t: t.label() == 'Chunk'):
                print (subtree)

    except Exception as e:
        print(str(e))

process_content()

Results (“xxx” stands for a word; in the first instance it is a verb and in the second instance it is a noun):

(Chunk xxx/VBN xxx/NN)
(Chunk xxx/VBN xxx/NN)
(Chunk xxx/VBN xxx/NN)
(Chunk xxx/VBN xxx/NN)
(Chunk xxx/VBN xxx/NN)

Now that I have the chunks of words, I want to find the 10 most frequently occurring or prominent Verb + Noun chunks. Is there any way I can attach a frequency or importance score to each chunk?
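
One way to attach counts, sketched rather than taken from the original code: collect each matched verb/noun pair as text instead of printing the subtree, and tally with collections.Counter:

    # Sketch: tally verb+noun chunks instead of printing them. The Counter is
    # created once, before the loop over train_text; the inner loop replaces
    # the print(subtree) loop after `chunked = chunkParser.parse(tagged)`.
    from collections import Counter

    chunk_counts = Counter()

    # ... inside the existing `for i in train_text:` loop ...
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
        chunk_text = " ".join(word for word, tag in subtree.leaves())   # e.g. "xxx xxx"
        chunk_counts[chunk_text] += 1

    # After the loop: the ten most frequent verb+noun chunks.
    top_10 = chunk_counts.most_common(10)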
