Fork (copy) JBD2 from JBD code base. Signed-Off-By: Mingming Cao --- /dev/null | 268 --- linux-2.6.18-rc6-ming/fs/jbd2/Makefile | 7 linux-2.6.18-rc6-ming/fs/jbd2/checkpoint.c | 696 ++++++++ linux-2.6.18-rc6-ming/fs/jbd2/commit.c | 867 ++++++++++ linux-2.6.18-rc6-ming/fs/jbd2/journal.c | 2079 +++++++++++++++++++++++ linux-2.6.18-rc6-ming/fs/jbd2/recovery.c | 592 ++++++ linux-2.6.18-rc6-ming/fs/jbd2/revoke.c | 703 ++++++++ linux-2.6.18-rc6-ming/fs/jbd2/transaction.c | 2080 ++++++++++++++++++++++++ linux-2.6.18-rc6-ming/include/linux/ext4_jbd2.h | 268 +++ linux-2.6.18-rc6-ming/include/linux/jbd2.h | 1099 ++++++++++++ 10 files changed, 8391 insertions(+), 268 deletions(-) diff -puN /dev/null fs/jbd2/checkpoint.c --- /dev/null 2006-09-07 07:04:09.114355360 -0700 +++ linux-2.6.18-rc6-ming/fs/jbd2/checkpoint.c 2006-09-07 23:05:04.832144396 -0700 @@ -0,0 +1,696 @@ +/* + * linux/fs/checkpoint.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. + */ + +#include +#include +#include +#include +#include + +/* + * Unlink a buffer from a transaction checkpoint list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink_first(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) { + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; +} + +/* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * + * Requires j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __try_to_free_cp_buf(struct journal_head *jh) +{ + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + ret = __journal_remove_checkpoint(jh) + 1; + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + BUFFER_TRACE(bh, "release"); + __brelse(bh); + } else { + jbd_unlock_bh_state(bh); + } + return ret; +} + +/* + * __log_wait_for_space: wait until there is space in the journal. + * + * Called under j-state_lock *only*. It will be unlocked if we have to wait + * for a checkpoint to free up some space in the log. + */ +void __log_wait_for_space(journal_t *journal) +{ + int nblocks; + assert_spin_locked(&journal->j_state_lock); + + nblocks = jbd_space_needed(journal); + while (__log_space_left(journal) < nblocks) { + if (journal->j_flags & JFS_ABORT) + return; + spin_unlock(&journal->j_state_lock); + mutex_lock(&journal->j_checkpoint_mutex); + + /* + * Test again, another process may have checkpointed while we + * were waiting for the checkpoint lock + */ + spin_lock(&journal->j_state_lock); + nblocks = jbd_space_needed(journal); + if (__log_space_left(journal) < nblocks) { + spin_unlock(&journal->j_state_lock); + log_do_checkpoint(journal); + spin_lock(&journal->j_state_lock); + } + mutex_unlock(&journal->j_checkpoint_mutex); + } +} + +/* + * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. + * The caller must restart a list walk. Wait for someone else to run + * jbd_unlock_bh_state(). + */ +static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) +{ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_lock_bh_state(bh); + jbd_unlock_bh_state(bh); + put_bh(bh); +} + +/* + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. + * + * Called with j_list_lock held. + */ +static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +{ + struct journal_head *jh; + struct buffer_head *bh; + tid_t this_tid; + int released = 0; + + this_tid = transaction->t_tid; +restart: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + /* + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list + */ + released = __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + } +} + +#define NR_BATCH 64 + +static void +__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) +{ + int i; + + ll_rw_block(SWRITE, *batch_count, bhs); + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = bhs[i]; + clear_buffer_jwrite(bh); + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + *batch_count = 0; +} + +/* + * Try to flush one buffer from the checkpoint list to disk. + * + * Return 1 if something happened which requires us to abort the current + * scan of the checkpoint list. + * + * Called with j_list_lock held and drops it if 1 is returned + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __process_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count) +{ + struct buffer_head *bh = jh2bh(jh); + int ret = 0; + + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + ret = 1; + } else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + ret = 1; + } else if (!buffer_dirty(bh)) { + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + ret = 1; + } else { + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal lock. + * We cannot afford to let the transaction logic start + * messing around with this buffer before we write it to + * disk, as that would break recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + set_buffer_jwrite(bh); + bhs[*batch_count] = bh; + __buffer_relink_io(jh); + jbd_unlock_bh_state(bh); + (*batch_count)++; + if (*batch_count == NR_BATCH) { + spin_unlock(&journal->j_list_lock); + __flush_batch(journal, bhs, batch_count); + ret = 1; + } + } + return ret; +} + +/* + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. + * + * The journal should be locked before calling this function. + */ +int log_do_checkpoint(journal_t *journal) +{ + transaction_t *transaction; + tid_t this_tid; + int result; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = cleanup_journal_tail(journal); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Take one transaction + * and write it. + */ + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; +restart: + /* + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). + */ + if (journal->j_checkpoint_transactions == transaction && + transaction->t_tid == this_tid) { + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + struct journal_head *jh; + int retry = 0; + + while (!retry && transaction->t_checkpoint_list) { + struct buffer_head *bh; + + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + retry = 1; + break; + } + retry = __process_buffer(journal, jh, bhs,&batch_count); + if (!retry && lock_need_resched(&journal->j_list_lock)){ + spin_unlock(&journal->j_list_lock); + retry = 1; + break; + } + } + + if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } + __flush_batch(journal, bhs, &batch_count); + } + + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } + /* + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one + */ + __wait_cp_io(journal, transaction); + } +out: + spin_unlock(&journal->j_list_lock); + result = cleanup_journal_tail(journal); + if (result < 0) + return result; + return 0; +} + +/* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the journal superblock if + * we have an abort error outstanding. + */ + +int cleanup_journal_tail(journal_t *journal) +{ + transaction_t * transaction; + tid_t first_tid; + unsigned long blocknr, freed; + + /* OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * + * If the log is now empty, we need to work out which is the + * next transaction ID we will write, and where it will + * start. */ + + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = journal->j_head; + } else { + first_tid = journal->j_transaction_sequence; + blocknr = journal->j_head; + } + spin_unlock(&journal->j_list_lock); + J_ASSERT(blocknr != 0); + + /* If the oldest pinned transaction is at the tail of the log + already then there's not much we can do right now. */ + if (journal->j_tail_sequence == first_tid) { + spin_unlock(&journal->j_state_lock); + return 1; + } + + /* OK, update the superblock to recover the freed space. + * Physical blocks come first: have we wrapped beyond the end of + * the log? */ + freed = blocknr - journal->j_tail; + if (blocknr < journal->j_tail) + freed = freed + journal->j_last - journal->j_first; + + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, first_tid, blocknr, freed); + + journal->j_free += freed; + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + spin_unlock(&journal->j_state_lock); + if (!(journal->j_flags & JFS_ABORT)) + journal_update_superblock(journal, 1); + return 0; +} + + +/* Checkpoint list management */ + +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of buffers reaped (for debug) + */ + +int __journal_clean_checkpoint_list(journal_t *journal) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + int ret = 0; + int released; + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + goto out; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; + } while (transaction != last_transaction); +out: + return ret; +} + +/* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * lists until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. + * + * This function is called with the journal locked. + * This function is called with j_list_lock held. + * This function is called with jbd_lock_bh_state(jh2bh(jh)) + */ + +int __journal_remove_checkpoint(struct journal_head *jh) +{ + transaction_t *transaction; + journal_t *journal; + int ret = 0; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + journal = transaction->t_journal; + + __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) + goto out; + JBUFFER_TRACE(jh, "transaction has no more buffers"); + + /* + * There is one special case to worry about: if we have just pulled the + * buffer off a committing transaction's forget list, then even if the + * checkpoint list is empty, the transaction obviously cannot be + * dropped! + * + * The locking here around j_committing_transaction is a bit sleazy. + * See the comment at the end of journal_commit_transaction(). + */ + if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "belongs to committing transaction"); + goto out; + } + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + + __journal_drop_transaction(journal, transaction); + + /* Just in case anybody was waiting for more transactions to be + checkpointed... */ + wake_up(&journal->j_wait_logspace); + ret = 1; +out: + JBUFFER_TRACE(jh, "exit"); + return ret; +} + +/* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ +void __journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) +{ + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; +} + +/* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ + +void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) +{ + assert_spin_locked(&journal->j_list_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT(transaction->t_state == T_FINISHED); + J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_sync_datalist == NULL); + J_ASSERT(transaction->t_forget == NULL); + J_ASSERT(transaction->t_iobuf_list == NULL); + J_ASSERT(transaction->t_shadow_list == NULL); + J_ASSERT(transaction->t_log_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(transaction->t_updates == 0); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); + kfree(transaction); +} diff -puN /dev/null fs/jbd2/commit.c --- /dev/null 2006-09-07 07:04:09.114355360 -0700 +++ linux-2.6.18-rc6-ming/fs/jbd2/commit.c 2006-09-07 23:05:04.836143905 -0700 @@ -0,0 +1,867 @@ +/* + * linux/fs/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Default IO end handler for temporary BJ_IO buffer_heads. + */ +static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + BUFFER_TRACE(bh, ""); + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); +} + +/* + * When an ext3-ordered file is truncated, it is possible that many pages are + * not sucessfully freed, because they are attached to a committing transaction. + * After the transaction commits, these pages are left on the LRU, with no + * ->mapping, and with attached buffers. These pages are trivially reclaimable + * by the VM, but their apparent absence upsets the VM accounting, and it makes + * the numbers in /proc/meminfo look odd. + * + * So here, we have a buffer which has just come off the forget list. Look to + * see if we can strip all buffers from the backing page. + * + * Called under lock_journal(), and possibly under journal_datalist_lock. The + * caller provided us with a ref against the buffer, and we drop that here. + */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page; + + if (buffer_dirty(bh)) + goto nope; + if (atomic_read(&bh->b_count) != 1) + goto nope; + page = bh->b_page; + if (!page) + goto nope; + if (page->mapping) + goto nope; + + /* OK, it's a truncated page */ + if (TestSetPageLocked(page)) + goto nope; + + page_cache_get(page); + __brelse(bh); + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + return; + +nope: + __brelse(bh); +} + +/* + * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is + * held. For ranking reasons we must trylock. If we lose, schedule away and + * return 0. j_list_lock is dropped in this case. + */ +static int inverted_lock(journal_t *journal, struct buffer_head *bh) +{ + if (!jbd_trylock_bh_state(bh)) { + spin_unlock(&journal->j_list_lock); + schedule(); + return 0; + } + return 1; +} + +/* Done it all: now write the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +static int journal_write_commit_record(journal_t *journal, + transaction_t *commit_transaction) +{ + struct journal_head *descriptor; + struct buffer_head *bh; + int i, ret; + int barrier_done = 0; + + if (is_journal_aborted(journal)) + return 0; + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) + return 1; + + bh = jh2bh(descriptor); + + /* AKPM: buglet - add `i' to tmp! */ + for (i = 0; i < bh->b_size; i += 512) { + journal_header_t *tmp = (journal_header_t*)bh->b_data; + tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + } + + JBUFFER_TRACE(descriptor, "write commit block"); + set_buffer_dirty(bh); + if (journal->j_flags & JFS_BARRIER) { + set_buffer_ordered(bh); + barrier_done = 1; + } + ret = sync_dirty_buffer(bh); + /* is it possible for another commit to fail at roughly + * the same time as this one? If so, we don't want to + * trust the barrier flag in the super, but instead want + * to remember if we sent a barrier request + */ + if (ret == -EOPNOTSUPP && barrier_done) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: barrier-based sync failed on %s - " + "disabling barriers\n", + bdevname(journal->j_dev, b)); + spin_lock(&journal->j_state_lock); + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); + + /* And try again, without the barrier */ + clear_buffer_ordered(bh); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + ret = sync_dirty_buffer(bh); + } + put_bh(bh); /* One for getblk() */ + journal_put_journal_head(descriptor); + + return (ret == -EIO); +} + +/* + * journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ +void journal_commit_transaction(journal_t *journal) +{ + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; + int bufs; + int flags; + int err; + unsigned long blocknr; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i; + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + +#ifdef COMMIT_STATS + spin_lock(&journal->j_list_lock); + summarise_journal_usage(journal); + spin_unlock(&journal->j_list_lock); +#endif + + /* Do we need to erase the effects of a prior journal_flush? */ + if (journal->j_flags & JFS_FLUSHED) { + jbd_debug(3, "super block updated\n"); + journal_update_superblock(journal, 1); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + J_ASSERT(journal->j_running_transaction != NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + J_ASSERT(commit_transaction->t_state == T_RUNNING); + + jbd_debug(1, "JBD: starting commit of transaction %d\n", + commit_transaction->t_tid); + + spin_lock(&journal->j_state_lock); + commit_transaction->t_state = T_LOCKED; + + spin_lock(&commit_transaction->t_handle_lock); + while (commit_transaction->t_updates) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (commit_transaction->t_updates) { + spin_unlock(&commit_transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + schedule(); + spin_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); + + J_ASSERT (commit_transaction->t_outstanding_credits <= + journal->j_max_transaction_buffers); + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple journal_get_write_access() calls to the same + * buffer are perfectly permissable. + */ + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + /* + * A journal_get_undo_access()+journal_release_buffer() may + * leave undo-committed data. + */ + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + jbd_slab_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + jbd_unlock_bh_state(bh); + } + journal_refile_buffer(journal, jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal->j_list_lock); + __journal_clean_checkpoint_list(journal); + spin_unlock(&journal->j_list_lock); + + jbd_debug (3, "JBD: commit phase 1\n"); + + /* + * Switch to a new revoke table. + */ + journal_switch_revoke_table(journal); + + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + commit_transaction->t_log_start = journal->j_head; + wake_up(&journal->j_wait_transaction_locked); + spin_unlock(&journal->j_state_lock); + + jbd_debug (3, "JBD: commit phase 2\n"); + + /* + * First, drop modified flag: all accesses to the buffers + * will be tracked for a new trasaction only -bzzz + */ + spin_lock(&journal->j_list_lock); + if (commit_transaction->t_buffers) { + new_jh = jh = commit_transaction->t_buffers->b_tnext; + do { + J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || + new_jh->b_modified == 0); + new_jh->b_modified = 0; + new_jh = new_jh->b_tnext; + } while (new_jh != jh); + } + spin_unlock(&journal->j_list_lock); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ + + err = 0; + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_sync_datalist, so we have to keep looping back to + * write_out_data until we *know* that the list is empty. + */ + bufs = 0; + /* + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + */ +write_out_data: + cond_resched(); + spin_lock(&journal->j_list_lock); + + while (commit_transaction->t_sync_datalist) { + struct buffer_head *bh; + + jh = commit_transaction->t_sync_datalist; + commit_transaction->t_sync_datalist = jh->b_tnext; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + BUFFER_TRACE(bh, "locked"); + if (!inverted_lock(journal, bh)) + goto write_out_data; + __journal_temp_unlink_buffer(jh); + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + if (lock_need_resched(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; + } + } else { + if (buffer_dirty(bh)) { + BUFFER_TRACE(bh, "start journal writeout"); + get_bh(bh); + wbuf[bufs++] = bh; + if (bufs == journal->j_wbufsize) { + jbd_debug(2, "submit %d writes\n", + bufs); + spin_unlock(&journal->j_list_lock); + ll_rw_block(SWRITE, bufs, wbuf); + journal_brelse_array(wbuf, bufs); + bufs = 0; + goto write_out_data; + } + } else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + if (!inverted_lock(journal, bh)) + goto write_out_data; + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + put_bh(bh); + if (lock_need_resched(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; + } + } + } + } + + if (bufs) { + spin_unlock(&journal->j_list_lock); + ll_rw_block(SWRITE, bufs, wbuf); + journal_brelse_array(wbuf, bufs); + spin_lock(&journal->j_list_lock); + } + + /* + * Wait for all previously submitted IO to complete. + */ + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + + jh = commit_transaction->t_locked_list->b_tprev; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + spin_lock(&journal->j_list_lock); + } + if (!inverted_lock(journal, bh)) { + put_bh(bh); + spin_lock(&journal->j_list_lock); + continue; + } + if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + put_bh(bh); + } else { + jbd_unlock_bh_state(bh); + } + put_bh(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (err) + __journal_abort_hard(journal); + + journal_write_revoke_records(journal, commit_transaction); + + jbd_debug(3, "JBD: commit phase 2\n"); + + /* + * If we found any dirty or locked buffers, then we should have + * looped back up to the write_out_data label. If there weren't + * any then journal_clean_data_list should have wiped the list + * clean by now, so check that it is in fact empty. + */ + J_ASSERT (commit_transaction->t_sync_datalist == NULL); + + jbd_debug (3, "JBD: commit phase 3\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + commit_transaction->t_state = T_COMMIT; + + descriptor = NULL; + bufs = 0; + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it for background writing. */ + + if (is_journal_aborted(journal)) { + JBUFFER_TRACE(jh, "journal is aborting: refile"); + journal_refile_buffer(journal, jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. */ + + if (!descriptor) { + struct buffer_head *bh; + + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) { + __journal_abort_hard(journal); + continue; + } + + bh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %llu (%p)\n", + (unsigned long long)bh->b_blocknr, bh->b_data); + header = (journal_header_t *)&bh->b_data[0]; + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + space_left = bh->b_size - sizeof(journal_header_t); + first_tag = 1; + set_buffer_jwrite(bh); + set_buffer_dirty(bh); + wbuf[bufs++] = bh; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(bh, "ph3: file as descriptor"); + journal_file_buffer(descriptor, commit_transaction, + BJ_LogCtl); + } + + /* Where is the buffer to be written? */ + + err = journal_next_log_block(journal, &blocknr); + /* If the block mapping failed, just abandon the buffer + and repeat this loop: we'll fall into the + refile-on-abort condition above. */ + if (err) { + __journal_abort_hard(journal); + continue; + } + + /* + * start_this_handle() uses t_outstanding_credits to determine + * the free space in the log, but this counter is changed + * by journal_next_log_block() also. + */ + commit_transaction->t_outstanding_credits--; + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + atomic_inc(&jh2bh(jh)->b_count); + + /* Make a temporary IO buffer with which to write it out + (this will requeue both the metadata buffer and the + temporary IO buffer). new_bh goes on BJ_IO*/ + + set_bit(BH_JWrite, &jh2bh(jh)->b_state); + /* + * akpm: journal_write_metadata_buffer() sets + * new_bh->b_transaction to commit_transaction. + * We need to clean this up before we release new_bh + * (which is of type BJ_IO) + */ + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); + set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + wbuf[bufs++] = jh2bh(new_jh); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JFS_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JFS_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); + tag->t_flags = cpu_to_be32(tag_flag); + tagp += sizeof(journal_block_tag_t); + space_left -= sizeof(journal_block_tag_t); + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == journal->j_wbufsize || + commit_transaction->t_buffers == NULL || + space_left < sizeof(journal_block_tag_t) + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); + +start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE, bh); + } + cond_resched(); + + /* Force a new descriptor to be generated next + time round the loop. */ + descriptor = NULL; + bufs = 0; + } + } + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + + Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD: commit phase 4\n"); + + /* + * akpm: these are BJ_IO, and j_list_lock is not needed. + * See __journal_try_to_free_buffer. + */ +wait_for_iobuf: + while (commit_transaction->t_iobuf_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_iobuf_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_iobuf; + } + if (cond_resched()) + goto wait_for_iobuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + clear_buffer_jwrite(bh); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); + journal_unfile_buffer(journal, jh); + + /* + * ->t_iobuf_list should contain only dummy buffer_heads + * which were created by journal_write_metadata_buffer(). + */ + BUFFER_TRACE(bh, "dumping temporary bh"); + journal_put_journal_head(jh); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + free_buffer_head(bh); + + /* We also have to unlock and free the corresponding + shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_bit(BH_JWrite, &bh->b_state); + J_ASSERT_BH(bh, buffer_jbddirty(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + journal_file_buffer(jh, commit_transaction, BJ_Forget); + /* Wake up any transactions which were waiting for this + IO to complete */ + wake_up_bit(&bh->b_state, BH_Unshadow); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD: commit phase 5\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + wait_for_ctlbuf: + while (commit_transaction->t_log_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_ctlbuf; + } + if (cond_resched()) + goto wait_for_ctlbuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + journal_unfile_buffer(journal, jh); + journal_put_journal_head(jh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + jbd_debug(3, "JBD: commit phase 6\n"); + + if (journal_write_commit_record(journal, commit_transaction)) + err = -EIO; + + if (err) + __journal_abort_hard(journal); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_iobuf_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + J_ASSERT(commit_transaction->t_log_list == NULL); + +restart_loop: + /* + * As there are other places (journal_unmap_buffer()) adding buffers + * to this list we have to be careful and hold the j_list_lock. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); + bh = jh2bh(jh); + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || + jh->b_transaction == journal->j_running_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + */ + if (jh->b_committed_data) { + jbd_slab_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + } + } else if (jh->b_frozen_data) { + jbd_slab_free(jh->b_frozen_data, bh->b_size); + jh->b_frozen_data = NULL; + } + + spin_lock(&journal->j_list_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + __journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + + /* A buffer which has been freed while still being + * journaled by a previous transaction may end up still + * being dirty here, but we want to avoid writing back + * that buffer in the future now that the last use has + * been committed. That's not only a performance gain, + * it also stops aliasing problems if the buffer is left + * behind for writeback and gets reallocated for another + * use in a different page. */ + if (buffer_freed(bh)) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + } + + if (buffer_jbddirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __journal_insert_checkpoint(jh, commit_transaction); + JBUFFER_TRACE(jh, "refile for checkpoint writeback"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + /* The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. */ + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + if (!jh->b_transaction) { + jbd_unlock_bh_state(bh); + /* needs a brelse */ + journal_remove_journal_head(bh); + release_buffer_page(bh); + } else + jbd_unlock_bh_state(bh); + } + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + /* + * This is a bit sleazy. We borrow j_list_lock to protect + * journal->j_committing_transaction in __journal_remove_checkpoint. + * Really, __journal_remove_checkpoint should be using j_state_lock but + * it's a bit hassle to hold that across __journal_remove_checkpoint + */ + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + /* + * Now recheck if some buffers did not get attached to the transaction + * while the lock was dropped... + */ + if (commit_transaction->t_forget) { + spin_unlock(&journal->j_list_lock); + spin_unlock(&journal->j_state_lock); + goto restart_loop; + } + + /* Done with this transaction! */ + + jbd_debug(3, "JBD: commit phase 8\n"); + + J_ASSERT(commit_transaction->t_state == T_COMMIT); + + commit_transaction->t_state = T_FINISHED; + J_ASSERT(commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + spin_unlock(&journal->j_state_lock); + + if (commit_transaction->t_checkpoint_list == NULL) { + __journal_drop_transaction(journal, commit_transaction); + } else { + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + } + spin_unlock(&journal->j_list_lock); + + jbd_debug(1, "JBD: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + + wake_up(&journal->j_wait_done_commit); +} diff -puN /dev/null fs/jbd2/journal.c --- /dev/null 2006-09-07 07:04:09.114355360 -0700 +++ linux-2.6.18-rc6-ming/fs/jbd2/journal.c 2006-09-07 23:05:04.845142802 -0700 @@ -0,0 +1,2079 @@ +/* + * linux/fs/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. + * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. + * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +EXPORT_SYMBOL(journal_start); +EXPORT_SYMBOL(journal_restart); +EXPORT_SYMBOL(journal_extend); +EXPORT_SYMBOL(journal_stop); +EXPORT_SYMBOL(journal_lock_updates); +EXPORT_SYMBOL(journal_unlock_updates); +EXPORT_SYMBOL(journal_get_write_access); +EXPORT_SYMBOL(journal_get_create_access); +EXPORT_SYMBOL(journal_get_undo_access); +EXPORT_SYMBOL(journal_dirty_data); +EXPORT_SYMBOL(journal_dirty_metadata); +EXPORT_SYMBOL(journal_release_buffer); +EXPORT_SYMBOL(journal_forget); +#if 0 +EXPORT_SYMBOL(journal_sync_buffer); +#endif +EXPORT_SYMBOL(journal_flush); +EXPORT_SYMBOL(journal_revoke); + +EXPORT_SYMBOL(journal_init_dev); +EXPORT_SYMBOL(journal_init_inode); +EXPORT_SYMBOL(journal_update_format); +EXPORT_SYMBOL(journal_check_used_features); +EXPORT_SYMBOL(journal_check_available_features); +EXPORT_SYMBOL(journal_set_features); +EXPORT_SYMBOL(journal_create); +EXPORT_SYMBOL(journal_load); +EXPORT_SYMBOL(journal_destroy); +EXPORT_SYMBOL(journal_update_superblock); +EXPORT_SYMBOL(journal_abort); +EXPORT_SYMBOL(journal_errno); +EXPORT_SYMBOL(journal_ack_err); +EXPORT_SYMBOL(journal_clear_err); +EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(journal_start_commit); +EXPORT_SYMBOL(journal_force_commit_nested); +EXPORT_SYMBOL(journal_wipe); +EXPORT_SYMBOL(journal_blocks_per_page); +EXPORT_SYMBOL(journal_invalidatepage); +EXPORT_SYMBOL(journal_try_to_free_buffers); +EXPORT_SYMBOL(journal_force_commit); + +static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +static void __journal_abort_soft (journal_t *journal, int errno); +static int journal_create_jbd_slab(size_t slab_size); + +/* + * Helper function used to manage commit timeouts + */ + +static void commit_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); +} + +/* + * kjournald: The main thread function used to manage a logging device + * journal. + * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. + */ + +static int kjournald(void *arg) +{ + journal_t *journal = arg; + transaction_t *transaction; + + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + setup_timer(&journal->j_commit_timer, commit_timeout, + (unsigned long)current); + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", + journal->j_commit_interval / HZ); + + /* + * And now, wait forever for commit wakeup events. + */ + spin_lock(&journal->j_state_lock); + +loop: + if (journal->j_flags & JFS_UNMOUNT) + goto end_loop; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + spin_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal_commit_transaction(journal); + spin_lock(&journal->j_state_lock); + goto loop; + } + + wake_up(&journal->j_wait_done_commit); + if (freezing(current)) { + /* + * The simpler the better. Flushing journal isn't a + * good idea, because that depends on threads that may + * be already stopped. + */ + jbd_debug(1, "Now suspending kjournald\n"); + spin_unlock(&journal->j_state_lock); + refrigerator(); + spin_lock(&journal->j_state_lock); + } else { + /* + * We assume on resume that commits are already there, + * so we don't sleep + */ + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&journal->j_wait_commit, &wait, + TASK_INTERRUPTIBLE); + if (journal->j_commit_sequence != journal->j_commit_request) + should_sleep = 0; + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, + transaction->t_expires)) + should_sleep = 0; + if (journal->j_flags & JFS_UNMOUNT) + should_sleep = 0; + if (should_sleep) { + spin_unlock(&journal->j_state_lock); + schedule(); + spin_lock(&journal->j_state_lock); + } + finish_wait(&journal->j_wait_commit, &wait); + } + + jbd_debug(1, "kjournald wakes\n"); + + /* + * Were we woken up by a commit wakeup event? + */ + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + goto loop; + +end_loop: + spin_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; +} + +static void journal_start_thread(journal_t *journal) +{ + kthread_run(kjournald, journal, "kjournald"); + wait_event(journal->j_wait_done_commit, journal->j_task != 0); +} + +static void journal_kill_thread(journal_t *journal) +{ + spin_lock(&journal->j_state_lock); + journal->j_flags |= JFS_UNMOUNT; + + while (journal->j_task) { + wake_up(&journal->j_wait_commit); + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, journal->j_task == 0); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); +} + +/* + * journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we *have* to lock the buffer to prevent anyone + * else from using and possibly modifying it while the IO is in + * progress. + * + * The function returns a pointer to the buffer_heads to be used for IO. + * + * We assume that the journal has already been locked in this function. + * + * Return value: + * <0: Error + * >=0: Finished OK + * + * On success: + * Bit 0 set == escape performed on the data + * Bit 1 set == buffer copy-out performed (kfree the data after IO) + */ + +int journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct journal_head **jh_out, + int blocknr) +{ + int need_copy_out = 0; + int done_copy_out = 0; + int do_escape = 0; + char *mapped_data; + struct buffer_head *new_bh; + struct journal_head *new_jh; + struct page *new_page; + unsigned int new_offset; + struct buffer_head *bh_in = jh2bh(jh_in); + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. + */ + J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + jbd_lock_bh_state(bh_in); +repeat: + if (jh_in->b_frozen_data) { + done_copy_out = 1; + new_page = virt_to_page(jh_in->b_frozen_data); + new_offset = offset_in_page(jh_in->b_frozen_data); + } else { + new_page = jh2bh(jh_in)->b_page; + new_offset = offset_in_page(jh2bh(jh_in)->b_data); + } + + mapped_data = kmap_atomic(new_page, KM_USER0); + /* + * Check for escaping + */ + if (*((__be32 *)(mapped_data + new_offset)) == + cpu_to_be32(JFS_MAGIC_NUMBER)) { + need_copy_out = 1; + do_escape = 1; + } + kunmap_atomic(mapped_data, KM_USER0); + + /* + * Do we need to do a data copy? + */ + if (need_copy_out && !done_copy_out) { + char *tmp; + + jbd_unlock_bh_state(bh_in); + tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS); + jbd_lock_bh_state(bh_in); + if (jh_in->b_frozen_data) { + jbd_slab_free(tmp, bh_in->b_size); + goto repeat; + } + + jh_in->b_frozen_data = tmp; + mapped_data = kmap_atomic(new_page, KM_USER0); + memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); + kunmap_atomic(mapped_data, KM_USER0); + + new_page = virt_to_page(tmp); + new_offset = offset_in_page(tmp); + done_copy_out = 1; + } + + /* + * Did we need to do an escaping? Now we've done all the + * copying, we can finally do so. + */ + if (do_escape) { + mapped_data = kmap_atomic(new_page, KM_USER0); + *((unsigned int *)(mapped_data + new_offset)) = 0; + kunmap_atomic(mapped_data, KM_USER0); + } + + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + jbd_unlock_bh_state(bh_in); + + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ + + set_bh_page(new_bh, new_page, new_offset); + new_jh->b_transaction = NULL; + new_bh->b_size = jh2bh(jh_in)->b_size; + new_bh->b_bdev = transaction->t_journal->j_dev; + new_bh->b_blocknr = blocknr; + set_buffer_mapped(new_bh); + set_buffer_dirty(new_bh); + + *jh_out = new_jh; + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + journal_file_buffer(jh_in, transaction, BJ_Shadow); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); + journal_file_buffer(new_jh, transaction, BJ_IO); + + return do_escape | (done_copy_out << 1); +} + +/* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + +/* + * __log_space_left: Return the number of free blocks left in the journal. + * + * Called with the journal already locked. + * + * Called under j_state_lock + */ + +int __log_space_left(journal_t *journal) +{ + int left = journal->j_free; + + assert_spin_locked(&journal->j_state_lock); + + /* + * Be pessimistic here about the number of those free blocks which + * might be required for log descriptor control blocks. + */ + +#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ + + left -= MIN_LOG_RESERVED_BLOCKS; + + if (left <= 0) + return 0; + left -= (left >> 3); + return left; +} + +/* + * Called under j_state_lock. Returns true if a transaction was started. + */ +int __log_start_commit(journal_t *journal, tid_t target) +{ + /* + * Are we already doing a recent enough commit? + */ + if (!tid_geq(journal->j_commit_request, target)) { + /* + * We want a new commit: OK, mark the request and wakup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd_debug(1, "JBD: requesting commit %d/%d\n", + journal->j_commit_request, + journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + return 1; + } + return 0; +} + +int log_start_commit(journal_t *journal, tid_t tid) +{ + int ret; + + spin_lock(&journal->j_state_lock); + ret = __log_start_commit(journal, tid); + spin_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * We can only force the running transaction if we don't have an active handle; + * otherwise, we will deadlock. + * + * Returns true if a transaction was started. + */ +int journal_force_commit_nested(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return 0; /* Nothing to retry */ + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + return 1; +} + +/* + * Start a commit of the current running transaction (if any). Returns true + * if a transaction was started, and fills its tid in at *ptid + */ +int journal_start_commit(journal_t *journal, tid_t *ptid) +{ + int ret = 0; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) { + tid_t tid = journal->j_running_transaction->t_tid; + + ret = __log_start_commit(journal, tid); + if (ret && ptid) + *ptid = tid; + } else if (journal->j_committing_transaction && ptid) { + /* + * If ext3_write_super() recently started a commit, then we + * have to wait for completion of that transaction + */ + *ptid = journal->j_committing_transaction->t_tid; + ret = 1; + } + spin_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ +int log_wait_commit(journal_t *journal, tid_t tid) +{ + int err = 0; + +#ifdef CONFIG_JBD_DEBUG + spin_lock(&journal->j_state_lock); + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_EMERG + "%s: error: j_commit_request=%d, tid=%d\n", + __FUNCTION__, journal->j_commit_request, tid); + } + spin_unlock(&journal->j_state_lock); +#endif + spin_lock(&journal->j_state_lock); + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + tid, journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + !tid_gt(tid, journal->j_commit_sequence)); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); + + if (unlikely(is_journal_aborted(journal))) { + printk(KERN_EMERG "journal commit I/O error\n"); + err = -EIO; + } + return err; +} + +/* + * Log buffer allocation routines: + */ + +int journal_next_log_block(journal_t *journal, unsigned long *retp) +{ + unsigned long blocknr; + + spin_lock(&journal->j_state_lock); + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + spin_unlock(&journal->j_state_lock); + return journal_bmap(journal, blocknr, retp); +} + +/* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ +int journal_bmap(journal_t *journal, unsigned long blocknr, + unsigned long *retp) +{ + int err = 0; + unsigned long ret; + + if (journal->j_inode) { + ret = bmap(journal->j_inode, blocknr); + if (ret) + *retp = ret; + else { + char b[BDEVNAME_SIZE]; + + printk(KERN_ALERT "%s: journal block not found " + "at offset %lu on %s\n", + __FUNCTION__, + blocknr, + bdevname(journal->j_dev, b)); + err = -EIO; + __journal_abort_soft(journal, err); + } + } else { + *retp = blocknr; /* +journal->j_blk_offset */ + } + return err; +} + +/* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + * + * After the caller of journal_get_descriptor_buffer() has finished modifying + * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * But we don't bother doing that, so there will be coherency problems with + * mmaps of blockdevs which hold live JBD-controlled filesystems. + */ +struct journal_head *journal_get_descriptor_buffer(journal_t *journal) +{ + struct buffer_head *bh; + unsigned long blocknr; + int err; + + err = journal_next_log_block(journal, &blocknr); + + if (err) + return NULL; + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + lock_buffer(bh); + memset(bh->b_data, 0, journal->j_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "return this buffer"); + return journal_add_journal_head(bh); +} + +/* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. */ + +/* First: create and setup a journal_t object in memory. We initialise + * very few fields yet: that has to wait until we have created the + * journal structures from from scratch, or loaded them from disk. */ + +static journal_t * journal_init_common (void) +{ + journal_t *journal; + int err; + + journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + goto fail; + memset(journal, 0, sizeof(*journal)); + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_logspace); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_checkpoint); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + mutex_init(&journal->j_barrier); + mutex_init(&journal->j_checkpoint_mutex); + spin_lock_init(&journal->j_revoke_lock); + spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_state_lock); + + journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); + + /* The journal is marked for error until we succeed with recovery! */ + journal->j_flags = JFS_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) { + kfree(journal); + goto fail; + } + return journal; +fail: + return NULL; +} + +/* journal_init_dev and journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + */ + +/** + * journal_t * journal_init_dev() - creates an initialises a journal structure + * @bdev: Block device on which to create the journal + * @fs_dev: Device which hold journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Lenght of the journal in blocks. + * @blocksize: blocksize of journalling device + * @returns: a newly created journal_t * + * + * journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. + * + */ +journal_t * journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + int start, int len, int blocksize) +{ + journal_t *journal = journal_init_common(); + struct buffer_head *bh; + int n; + + if (!journal) + return NULL; + + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + journal->j_blocksize = blocksize; + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + J_ASSERT(bh != NULL); + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", + __FUNCTION__); + kfree(journal); + journal = NULL; + } + + return journal; +} + +/** + * journal_t * journal_init_inode () - creates a journal which maps to a inode. + * @inode: An inode to create the journal in + * + * journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. + */ +journal_t * journal_init_inode (struct inode *inode) +{ + struct buffer_head *bh; + journal_t *journal = journal_init_common(); + int err; + int n; + unsigned long blocknr; + + if (!journal) + return NULL; + + journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; + journal->j_inode = inode; + jbd_debug(1, + "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", + journal, inode->i_sb->s_id, inode->i_ino, + (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", + __FUNCTION__); + kfree(journal); + return NULL; + } + + err = journal_bmap(journal, 0, &blocknr); + /* If that failed, give up */ + if (err) { + printk(KERN_ERR "%s: Cannnot locate journal superblock\n", + __FUNCTION__); + kfree(journal); + return NULL; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + J_ASSERT(bh != NULL); + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +} + +/* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock (journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. + */ + +static int journal_reset(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned int first, last; + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); + + journal->j_first = first; + journal->j_last = last; + + journal->j_head = first; + journal->j_tail = first; + journal->j_free = last - first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + journal->j_max_transaction_buffers = journal->j_maxlen / 4; + + /* Add the dynamic fields and write it to disk. */ + journal_update_superblock(journal, 1); + journal_start_thread(journal); + return 0; +} + +/** + * int journal_create() - Initialise the new journal file + * @journal: Journal to create. This structure must have been initialised + * + * Given a journal_t structure which tells us which disk blocks we can + * use, create a new journal superblock and initialise all of the + * journal fields from scratch. + **/ +int journal_create(journal_t *journal) +{ + unsigned long blocknr; + struct buffer_head *bh; + journal_superblock_t *sb; + int i, err; + + if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { + printk (KERN_ERR "Journal length (%d blocks) too short.\n", + journal->j_maxlen); + journal_fail_superblock(journal); + return -EINVAL; + } + + if (journal->j_inode == NULL) { + /* + * We don't know what block to start at! + */ + printk(KERN_EMERG + "%s: creation of journal on external device!\n", + __FUNCTION__); + BUG(); + } + + /* Zero out the entire journal on disk. We cannot afford to + have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ + jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); + for (i = 0; i < journal->j_maxlen; i++) { + err = journal_bmap(journal, i, &blocknr); + if (err) + return err; + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + lock_buffer(bh); + memset (bh->b_data, 0, journal->j_blocksize); + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + __brelse(bh); + } + + sync_blockdev(journal->j_dev); + jbd_debug(1, "JBD: journal cleared.\n"); + + /* OK, fill in the initial static fields in the new superblock */ + sb = journal->j_superblock; + + sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + + sb->s_blocksize = cpu_to_be32(journal->j_blocksize); + sb->s_maxlen = cpu_to_be32(journal->j_maxlen); + sb->s_first = cpu_to_be32(1); + + journal->j_transaction_sequence = 1; + + journal->j_flags &= ~JFS_ABORT; + journal->j_format_version = 2; + + return journal_reset(journal); +} + +/** + * void journal_update_superblock() - Update journal sb on disk. + * @journal: The journal to update. + * @wait: Set to '0' if you don't want to wait for IO completion. + * + * Update a journal's dynamic superblock fields and write it to disk, + * optionally waiting for the IO to complete. + */ +void journal_update_superblock(journal_t *journal, int wait) +{ + journal_superblock_t *sb = journal->j_superblock; + struct buffer_head *bh = journal->j_sb_buffer; + + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0) and there are no outstanding transactions + * in the filesystem, then we can safely defer the superblock update + * until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0 && journal->j_tail_sequence == + journal->j_transaction_sequence) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %ld, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + goto out; + } + + spin_lock(&journal->j_state_lock); + jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, journal->j_errno); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(journal->j_tail); + sb->s_errno = cpu_to_be32(journal->j_errno); + spin_unlock(&journal->j_state_lock); + + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + if (wait) + sync_dirty_buffer(bh); + else + ll_rw_block(SWRITE, 1, &bh); + +out: + /* If we have just flushed the log (by marking s_start==0), then + * any future commit will have to be careful to update the + * superblock again to re-record the true start of the log. */ + + spin_lock(&journal->j_state_lock); + if (sb->s_start) + journal->j_flags &= ~JFS_FLUSHED; + else + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ + +static int journal_get_superblock(journal_t *journal) +{ + struct buffer_head *bh; + journal_superblock_t *sb; + int err = -EIO; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + printk (KERN_ERR + "JBD: IO error reading journal superblock\n"); + goto out; + } + } + + sb = journal->j_superblock; + + err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + goto out; + } + + switch(be32_to_cpu(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JFS_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + default: + printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + goto out; + } + + if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) + journal->j_maxlen = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + printk (KERN_WARNING "JBD: journal file too short\n"); + goto out; + } + + return 0; + +out: + journal_fail_superblock(journal); + return err; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ + +static int load_superblock(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_last = be32_to_cpu(sb->s_maxlen); + journal->j_errno = be32_to_cpu(sb->s_errno); + + return 0; +} + + +/** + * int journal_load() - Read journal from disk. + * @journal: Journal to act on. + * + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ +int journal_load(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { + printk (KERN_WARNING + "JBD: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* + * Create a slab for this blocksize + */ + err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize)); + if (err) + return err; + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (journal_recover(journal)) + goto recovery_error; + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JFS_ABORT; + journal->j_flags |= JFS_LOADED; + return 0; + +recovery_error: + printk (KERN_WARNING "JBD: recovery failed\n"); + return -EIO; +} + +/** + * void journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. + * + * Release a journal_t structure once it is no longer in use by the + * journaled object. + */ +void journal_destroy(journal_t *journal) +{ + /* Wait for the commit thread to wake up and die. */ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + + /* Totally anal locking here... */ + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + log_do_checkpoint(journal); + spin_lock(&journal->j_list_lock); + } + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + spin_unlock(&journal->j_list_lock); + + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = ++journal->j_transaction_sequence; + if (journal->j_sb_buffer) { + journal_update_superblock(journal, 1); + brelse(journal->j_sb_buffer); + } + + if (journal->j_inode) + iput(journal->j_inode); + if (journal->j_revoke) + journal_destroy_revoke(journal); + kfree(journal->j_wbuf); + kfree(journal); +} + + +/** + *int journal_check_used_features () - Check if features specified are used. + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. + **/ + +int journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; +} + +/** + * int journal_check_available_features() - Check feature set in journalling layer + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + +int journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + + sb = journal->j_superblock; + + /* We can support any known requested features iff the + * superblock is in version 2. Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && + (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; +} + +/** + * int journal_set_features () - Mark a given journal feature in the superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ + +int journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; +} + + +/** + * int journal_update_format () - Update on-disk journal structure. + * @journal: Journal to act on. + * + * Given an initialised but unloaded journal struct, poke about in the + * on-disk structure to update it to the most recent supported version. + */ +int journal_update_format (journal_t *journal) +{ + journal_superblock_t *sb; + int err; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + switch (be32_to_cpu(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V2: + return 0; + case JFS_SUPERBLOCK_V1: + return journal_convert_superblock_v1(journal, sb); + default: + break; + } + return -EINVAL; +} + +static int journal_convert_superblock_v1(journal_t *journal, + journal_superblock_t *sb) +{ + int offset, blocksize; + struct buffer_head *bh; + + printk(KERN_WARNING + "JBD: Converting superblock from version 1 to 2.\n"); + + /* Pre-initialise new fields to zero */ + offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); + blocksize = be32_to_cpu(sb->s_blocksize); + memset(&sb->s_feature_compat, 0, blocksize-offset); + + sb->s_nr_users = cpu_to_be32(1); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + journal->j_format_version = 2; + + bh = journal->j_sb_buffer; + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + return 0; +} + + +/** + * int journal_flush () - Flush journal + * @journal: Journal to act on. + * + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + +int journal_flush(journal_t *journal) +{ + int err = 0; + transaction_t *transaction = NULL; + unsigned long old_tail; + + spin_lock(&journal->j_state_lock); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) { + tid_t tid = transaction->t_tid; + + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + } else { + spin_unlock(&journal->j_state_lock); + } + + /* ...and flush everything in the log out to disk. */ + spin_lock(&journal->j_list_lock); + while (!err && journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + err = log_do_checkpoint(journal); + spin_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. Any future + * commits of data to the journal will restore the current + * s_start value. */ + spin_lock(&journal->j_state_lock); + old_tail = journal->j_tail; + journal->j_tail = 0; + spin_unlock(&journal->j_state_lock); + journal_update_superblock(journal, 1); + spin_lock(&journal->j_state_lock); + journal->j_tail = old_tail; + + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * int journal_wipe() - Wipe journal contents + * @journal: Journal to act on. + * @write: flag (see below) + * + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and journal_load(). + * + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + +int journal_wipe(journal_t *journal, int write) +{ + journal_superblock_t *sb; + int err = 0; + + J_ASSERT (!(journal->j_flags & JFS_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + if (!journal->j_tail) + goto no_recovery; + + printk (KERN_WARNING "JBD: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = journal_skip_recovery(journal); + if (write) + journal_update_superblock(journal, 1); + + no_recovery: + return err; +} + +/* + * journal_dev_name: format a character string to describe on what + * device this journal is present. + */ + +static const char *journal_dev_name(journal_t *journal, char *buffer) +{ + struct block_device *bdev; + + if (journal->j_inode) + bdev = journal->j_inode->i_sb->s_bdev; + else + bdev = journal->j_dev; + + return bdevname(bdev, buffer); +} + +/* + * Journal abort has very specific semantics, which we describe + * for journal abort. + * + * Two internal function, which provide abort to te jbd layer + * itself are here. + */ + +/* + * Quick version for internal journal use (doesn't lock the journal). + * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, + * and don't attempt to make any other journal updates. + */ +void __journal_abort_hard(journal_t *journal) +{ + transaction_t *transaction; + char b[BDEVNAME_SIZE]; + + if (journal->j_flags & JFS_ABORT) + return; + + printk(KERN_ERR "Aborting journal on device %s.\n", + journal_dev_name(journal, b)); + + spin_lock(&journal->j_state_lock); + journal->j_flags |= JFS_ABORT; + transaction = journal->j_running_transaction; + if (transaction) + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); +} + +/* Soft abort: record the abort error status in the journal superblock, + * but don't do any other IO. */ +static void __journal_abort_soft (journal_t *journal, int errno) +{ + if (journal->j_flags & JFS_ABORT) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + __journal_abort_hard(journal); + + if (errno) + journal_update_superblock(journal, 1); +} + +/** + * void journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. + * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final journal_stop, which will receive the -EIO error. + * + * Finally, the journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). + * + */ + +void journal_abort(journal_t *journal, int errno) +{ + __journal_abort_soft(journal, errno); +} + +/** + * int journal_errno () - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno numbet set with journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount time -EROFS will + * be returned. + */ +int journal_errno(journal_t *journal) +{ + int err; + + spin_lock(&journal->j_state_lock); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + err = journal->j_errno; + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * int journal_clear_err () - clears the journal's error state + * @journal: journal to act on. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. + */ +int journal_clear_err(journal_t *journal) +{ + int err = 0; + + spin_lock(&journal->j_state_lock); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * void journal_ack_err() - Ack journal err. + * @journal: journal to act on. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. + */ +void journal_ack_err(journal_t *journal) +{ + spin_lock(&journal->j_state_lock); + if (journal->j_errno) + journal->j_flags |= JFS_ACK_ERR; + spin_unlock(&journal->j_state_lock); +} + +int journal_blocks_per_page(struct inode *inode) +{ + return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); +} + +/* + * Simple support for retrying memory allocations. Introduced to help to + * debug different VM deadlock avoidance strategies. + */ +void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) +{ + return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0)); +} + +/* + * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed + * and allocate frozen and commit buffers from these slabs. + * + * Reason for doing this is to avoid, SLAB_DEBUG - since it could + * cause bh to cross page boundary. + */ + +#define JBD_MAX_SLABS 5 +#define JBD_SLAB_INDEX(size) (size >> 11) + +static kmem_cache_t *jbd_slab[JBD_MAX_SLABS]; +static const char *jbd_slab_names[JBD_MAX_SLABS] = { + "jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k" +}; + +static void journal_destroy_jbd_slabs(void) +{ + int i; + + for (i = 0; i < JBD_MAX_SLABS; i++) { + if (jbd_slab[i]) + kmem_cache_destroy(jbd_slab[i]); + jbd_slab[i] = NULL; + } +} + +static int journal_create_jbd_slab(size_t slab_size) +{ + int i = JBD_SLAB_INDEX(slab_size); + + BUG_ON(i >= JBD_MAX_SLABS); + + /* + * Check if we already have a slab created for this size + */ + if (jbd_slab[i]) + return 0; + + /* + * Create a slab and force alignment to be same as slabsize - + * this will make sure that allocations won't cross the page + * boundary. + */ + jbd_slab[i] = kmem_cache_create(jbd_slab_names[i], + slab_size, slab_size, 0, NULL, NULL); + if (!jbd_slab[i]) { + printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +void * jbd_slab_alloc(size_t size, gfp_t flags) +{ + int idx; + + idx = JBD_SLAB_INDEX(size); + BUG_ON(jbd_slab[idx] == NULL); + return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL); +} + +void jbd_slab_free(void *ptr, size_t size) +{ + int idx; + + idx = JBD_SLAB_INDEX(size); + BUG_ON(jbd_slab[idx] == NULL); + kmem_cache_free(jbd_slab[idx], ptr); +} + +/* + * Journal_head storage management + */ +static kmem_cache_t *journal_head_cache; +#ifdef CONFIG_JBD_DEBUG +static atomic_t nr_journal_heads = ATOMIC_INIT(0); +#endif + +static int journal_init_journal_head_cache(void) +{ + int retval; + + J_ASSERT(journal_head_cache == 0); + journal_head_cache = kmem_cache_create("journal_head", + sizeof(struct journal_head), + 0, /* offset */ + 0, /* flags */ + NULL, /* ctor */ + NULL); /* dtor */ + retval = 0; + if (journal_head_cache == 0) { + retval = -ENOMEM; + printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + } + return retval; +} + +static void journal_destroy_journal_head_cache(void) +{ + J_ASSERT(journal_head_cache != NULL); + kmem_cache_destroy(journal_head_cache); + journal_head_cache = NULL; +} + +/* + * journal_head splicing and dicing + */ +static struct journal_head *journal_alloc_journal_head(void) +{ + struct journal_head *ret; + static unsigned long last_warning; + +#ifdef CONFIG_JBD_DEBUG + atomic_inc(&nr_journal_heads); +#endif + ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + if (ret == 0) { + jbd_debug(1, "out of memory for journal_head\n"); + if (time_after(jiffies, last_warning + 5*HZ)) { + printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", + __FUNCTION__); + last_warning = jiffies; + } + while (ret == 0) { + yield(); + ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + } + } + return ret; +} + +static void journal_free_journal_head(struct journal_head *jh) +{ +#ifdef CONFIG_JBD_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, JBD_POISON_FREE, sizeof(*jh)); +#endif + kmem_cache_free(journal_head_cache, jh); +} + +/* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head may be detached from its buffer_head when the journal_head's + * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. + * Various places in JBD call journal_remove_journal_head() to indicate that the + * journal_head can be dropped if needed. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * journal_put_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = journal_add_journal_head(bh); + * ... + * jh->b_transaction = xxx; + * journal_put_journal_head(jh); + * + * Now, the journal_head's b_jcount is zero, but it is safe from being released + * because it has a non-zero b_transaction. + */ + +/* + * Give a buffer_head a journal_head. + * + * Doesn't need the journal lock. + * May sleep. + */ +struct journal_head *journal_add_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh; + struct journal_head *new_jh = NULL; + +repeat: + if (!buffer_jbd(bh)) { + new_jh = journal_alloc_journal_head(); + memset(new_jh, 0, sizeof(*new_jh)); + } + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_page && bh->b_page->mapping)); + + if (!new_jh) { + jbd_unlock_bh_journal_head(bh); + goto repeat; + } + + jh = new_jh; + new_jh = NULL; /* We consumed it */ + set_buffer_jbd(bh); + bh->b_private = jh; + jh->b_bh = bh; + get_bh(bh); + BUFFER_TRACE(bh, "added journal_head"); + } + jh->b_jcount++; + jbd_unlock_bh_journal_head(bh); + if (new_jh) + journal_free_journal_head(new_jh); + return bh->b_private; +} + +/* + * Grab a ref against this buffer_head's journal_head. If it ended up not + * having a journal_head, return NULL + */ +struct journal_head *journal_grab_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = NULL; + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + jh->b_jcount++; + } + jbd_unlock_bh_journal_head(bh); + return jh; +} + +static void __journal_remove_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + + J_ASSERT_JH(jh, jh->b_jcount >= 0); + + get_bh(bh); + if (jh->b_jcount == 0) { + if (jh->b_transaction == NULL && + jh->b_next_transaction == NULL && + jh->b_cp_transaction == NULL) { + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing " + "b_frozen_data\n", + __FUNCTION__); + jbd_slab_free(jh->b_frozen_data, bh->b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing " + "b_committed_data\n", + __FUNCTION__); + jbd_slab_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + __brelse(bh); + journal_free_journal_head(jh); + } else { + BUFFER_TRACE(bh, "journal_head was locked"); + } + } +} + +/* + * journal_remove_journal_head(): if the buffer isn't attached to a transaction + * and has a zero b_jcount then remove and release its journal_head. If we did + * see that the buffer is not used by any transaction we also "logically" + * decrement ->b_count. + * + * We in fact take an additional increment on ->b_count as a convenience, + * because the caller usually wants to do additional things with the bh + * after calling here. + * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some + * time. Once the caller has run __brelse(), the buffer is eligible for + * reaping by try_to_free_buffers(). + */ +void journal_remove_journal_head(struct buffer_head *bh) +{ + jbd_lock_bh_journal_head(bh); + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); +} + +/* + * Drop a reference on the passed journal_head. If it fell to zero then try to + * release the journal_head from the buffer_head. + */ +void journal_put_journal_head(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_journal_head(bh); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount && !jh->b_transaction) { + __journal_remove_journal_head(bh); + __brelse(bh); + } + jbd_unlock_bh_journal_head(bh); +} + +/* + * /proc tunables + */ +#if defined(CONFIG_JBD_DEBUG) +int journal_enable_debug; +EXPORT_SYMBOL(journal_enable_debug); +#endif + +#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) + +static struct proc_dir_entry *proc_jbd_debug; + +static int read_jbd_debug(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int ret; + + ret = sprintf(page + off, "%d\n", journal_enable_debug); + *eof = 1; + return ret; +} + +static int write_jbd_debug(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char buf[32]; + + if (count > ARRAY_SIZE(buf) - 1) + count = ARRAY_SIZE(buf) - 1; + if (copy_from_user(buf, buffer, count)) + return -EFAULT; + buf[ARRAY_SIZE(buf) - 1] = '\0'; + journal_enable_debug = simple_strtoul(buf, NULL, 10); + return count; +} + +#define JBD_PROC_NAME "sys/fs/jbd-debug" + +static void __init create_jbd_proc_entry(void) +{ + proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); + if (proc_jbd_debug) { + /* Why is this so hard? */ + proc_jbd_debug->read_proc = read_jbd_debug; + proc_jbd_debug->write_proc = write_jbd_debug; + } +} + +static void __exit remove_jbd_proc_entry(void) +{ + if (proc_jbd_debug) + remove_proc_entry(JBD_PROC_NAME, NULL); +} + +#else + +#define create_jbd_proc_entry() do {} while (0) +#define remove_jbd_proc_entry() do {} while (0) + +#endif + +kmem_cache_t *jbd_handle_cache; + +static int __init journal_init_handle_cache(void) +{ + jbd_handle_cache = kmem_cache_create("journal_handle", + sizeof(handle_t), + 0, /* offset */ + 0, /* flags */ + NULL, /* ctor */ + NULL); /* dtor */ + if (jbd_handle_cache == NULL) { + printk(KERN_EMERG "JBD: failed to create handle cache\n"); + return -ENOMEM; + } + return 0; +} + +static void journal_destroy_handle_cache(void) +{ + if (jbd_handle_cache) + kmem_cache_destroy(jbd_handle_cache); +} + +/* + * Module startup and shutdown + */ + +static int __init journal_init_caches(void) +{ + int ret; + + ret = journal_init_revoke_caches(); + if (ret == 0) + ret = journal_init_journal_head_cache(); + if (ret == 0) + ret = journal_init_handle_cache(); + return ret; +} + +static void journal_destroy_caches(void) +{ + journal_destroy_revoke_caches(); + journal_destroy_journal_head_cache(); + journal_destroy_handle_cache(); + journal_destroy_jbd_slabs(); +} + +static int __init journal_init(void) +{ + int ret; + +/* Static check for data structure consistency. There's no code + * invoked --- we'll just get a linker failure if things aren't right. + */ + extern void journal_bad_superblock_size(void); + if (sizeof(struct journal_superblock_s) != 1024) + journal_bad_superblock_size(); + + + ret = journal_init_caches(); + if (ret != 0) + journal_destroy_caches(); + create_jbd_proc_entry(); + return ret; +} + +static void __exit journal_exit(void) +{ +#ifdef CONFIG_JBD_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); +#endif + remove_jbd_proc_entry(); + journal_destroy_caches(); +} + +MODULE_LICENSE("GPL"); +module_init(journal_init); +module_exit(journal_exit); + diff -puN /dev/null fs/jbd2/Makefile --- /dev/null 2006-09-07 07:04:09.114355360 -0700 +++ linux-2.6.18-rc6-ming/fs/jbd2/Makefile 2006-09-07 23:05:04.845142802 -0700 @@ -0,0 +1,7 @@ +# +# Makefile for the linux journaling routines. +# + +obj-$(CONFIG_JBD) += jbd.o + +jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff -puN /dev/null fs/jbd2/recovery.c --- /dev/null 2006-09-07 07:04:09.114355360 -0700 +++ linux-2.6.18-rc6-ming/fs/jbd2/recovery.c 2006-09-07 23:05:04.848142434 -0700 @@ -0,0 +1,592 @@ +/* + * linux/fs/recovery.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#endif + +/* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. + */ +struct recovery_info +{ + tid_t start_transaction; + tid_t end_transaction; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; +}; + +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); +static int scan_revoke_records(journal_t *, struct buffer_head *, + tid_t, struct recovery_info *); + +#ifdef __KERNEL__ + +/* Release readahead buffers after use */ +void journal_brelse_array(struct buffer_head *b[], int n) +{ + while (--n >= 0) + brelse (b[n]); +} + + +/* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + +#define MAXBUF 8 +static int do_readahead(journal_t *journal, unsigned int start) +{ + int err; + unsigned int max, nbufs, next; + unsigned long blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_maxlen) + max = journal->j_maxlen; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + err = journal_bmap(journal, next, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + next); + goto failed; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + err = -ENOMEM; + goto failed; + } + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + ll_rw_block(READ, nbufs, bufs); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + ll_rw_block(READ, nbufs, bufs); + err = 0; + +failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); + return err; +} + +#endif /* __KERNEL__ */ + + +/* + * Read a block from the journal + */ + +static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) +{ + int err; + unsigned long blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + if (offset >= journal->j_maxlen) { + printk(KERN_ERR "JBD: corrupted journal superblock\n"); + return -EIO; + } + + err = journal_bmap(journal, offset, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + offset); + return err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* If this is a brand new buffer, start readahead. + Otherwise, we assume we are already reading it. */ + if (!buffer_req(bh)) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; +} + + +/* + * Count the number of in-use tags in a journal descriptor block. + */ + +static int count_tags(struct buffer_head *bh, int size) +{ + char * tagp; + journal_block_tag_t * tag; + int nr = 0; + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { + tag = (journal_block_tag_t *) tagp; + + nr++; + tagp += sizeof(journal_block_tag_t); + if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) + tagp += 16; + + if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) + break; + } + + return nr; +} + + +/* Make sure we wrap around the log correctly! */ +#define wrap(journal, var) \ +do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ +} while (0) + +/** + * journal_recover - recovers a on-disk journal + * @journal: the journal to recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ +int journal_recover(journal_t *journal) +{ + int err; + journal_superblock_t * sb; + + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. + */ + + if (!sb->s_start) { + jbd_debug(1, "No recovery required, last transaction %d\n", + be32_to_cpu(sb->s_sequence)); + journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + return 0; + } + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd_debug(0, "JBD: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + + journal_clear_revoke(journal); + sync_blockdev(journal->j_fs_dev); + return err; +} + +/** + * journal_skip_recovery - Start journal and wipe exiting records + * @journal: journal to startup + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * This function does'nt appear to be exorted.. + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. + */ +int journal_skip_recovery(journal_t *journal) +{ + int err; + journal_superblock_t * sb; + + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + sb = journal->j_superblock; + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + } else { +#ifdef CONFIG_JBD_DEBUG + int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); +#endif + jbd_debug(0, + "JBD: ignoring %d transaction%s from the journal.\n", + dropped, (dropped == 1) ? "" : "s"); + journal->j_transaction_sequence = ++info.end_transaction; + } + + journal->j_tail = 0; + return err; +} + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int first_commit_ID, next_commit_ID; + unsigned long next_log_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head * bh; + unsigned int sequence; + int blocktype; + + /* Precompute the maximum metadata descriptors in a descriptor block */ + int MAX_BLOCKS_PER_DESC; + MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) + / sizeof(journal_block_tag_t)); + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = be32_to_cpu(sb->s_sequence); + next_log_block = be32_to_cpu(sb->s_start); + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + + jbd_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + int flags; + char * tagp; + journal_block_tag_t * tag; + struct buffer_head * obh; + struct buffer_head * nbh; + + cond_resched(); /* We're under lock_kernel() */ + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. */ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. */ + + jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { + brelse(bh); + break; + } + + blocktype = be32_to_cpu(tmp->h_blocktype); + sequence = be32_to_cpu(tmp->h_sequence); + jbd_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) { + brelse(bh); + break; + } + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JFS_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it + * in pass REPLAY; otherwise, just skip over the + * blocks it describes. */ + if (pass != PASS_REPLAY) { + next_log_block += + count_tags(bh, journal->j_blocksize); + wrap(journal, next_log_block); + brelse(bh); + continue; + } + + /* A descriptor block: we can now write all of + * the data blocks. Yay, useful work is finally + * getting done here! */ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) + <= journal->j_blocksize) { + unsigned long io_block; + + tag = (journal_block_tag_t *) tagp; + flags = be32_to_cpu(tag->t_flags); + + io_block = next_log_block++; + wrap(journal, next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but + * report failure at the end. */ + success = err; + printk (KERN_ERR + "JBD: IO error %d recovering " + "block %ld in log\n", + err, io_block); + } else { + unsigned long blocknr; + + J_ASSERT(obh != NULL); + blocknr = be32_to_cpu(tag->t_blocknr); + + /* If the block has been + * revoked, then we're all done + * here. */ + if (journal_test_revoke + (journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Find a buffer for the new + * data being restored */ + nbh = __getblk(journal->j_fs_dev, + blocknr, + journal->j_blocksize); + if (nbh == NULL) { + printk(KERN_ERR + "JBD: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + brelse(obh); + goto failed; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, + journal->j_blocksize); + if (flags & JFS_FLAG_ESCAPE) { + *((__be32 *)bh->b_data) = + cpu_to_be32(JFS_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + /* ll_rw_block(WRITE, 1, &nbh); */ + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + + skip_write: + tagp += sizeof(journal_block_tag_t); + if (!(flags & JFS_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JFS_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + + case JFS_COMMIT_BLOCK: + /* Found an expected commit block: not much to + * do other than move on to the next sequence + * number. */ + brelse(bh); + next_commit_ID++; + continue; + + case JFS_REVOKE_BLOCK: + /* If we aren't in the REVOKE pass, then we can + * just skip over this block. */ + if (pass != PASS_REVOKE) { + brelse(bh); + continue; + } + + err = scan_revoke_records(journal, bh, + next_commit_ID, info); + brelse(bh); + if (err) + goto failed; + continue; + + default: + jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + brelse(bh); + goto done; + } + } + + done: + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) + info->end_transaction = next_commit_ID; + else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { + printk (KERN_ERR "JBD: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + + return success; + + failed: + return err; +} + + +/* Scan a revoke record, marking all blocks mentioned as revoked. */ + +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, + tid_t sequence, struct recovery_info *info) +{ + journal_revoke_header_t *header; + int offset, max; + + header = (journal_revoke_header_t *) bh->b_data; + offset = sizeof(journal_revoke_header_t); + max = be32_to_cpu(header->r_count); + + while (offset < max) { + unsigned long blocknr; + int err; + + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + offset += 4; + err = journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + ++info->nr_revokes; + } + return 0; +} diff -puN /dev/null fs/jbd2/revoke.c --- /dev/null 2006-09-07 07:04:09.114355360 -0700 +++ linux-2.6.18-rc6-ming/fs/jbd2/revoke.c 2006-09-07 23:05:04.851142066 -0700 @@ -0,0 +1,703 @@ +/* + * linux/fs/revoke.c + * + * Written by Stephen C. Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +static kmem_cache_t *revoke_record_cache; +static kmem_cache_t *revoke_table_cache; + +/* Each revoke record represents one single revoked block. During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + +struct jbd_revoke_record_s +{ + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned long blocknr; +}; + + +/* The revoke table is just a simple hash table of revoke records. */ +struct jbd_revoke_table_s +{ + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. */ + int hash_size; + int hash_shift; + struct list_head *hash_table; +}; + + +#ifdef __KERNEL__ +static void write_one_revoke_record(journal_t *, transaction_t *, + struct journal_head **, int *, + struct jbd_revoke_record_s *); +static void flush_descriptor(journal_t *, struct journal_head *, int); +#endif + +/* Utility functions to maintain the revoke table */ + +/* Borrowed from buffer.c: this is a tried and tested block hash function */ +static inline int hash(journal_t *journal, unsigned long block) +{ + struct jbd_revoke_table_s *table = journal->j_revoke; + int hash_shift = table->hash_shift; + + return ((block << (hash_shift - 6)) ^ + (block >> 13) ^ + (block << (hash_shift - 12))) & (table->hash_size - 1); +} + +static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, + tid_t seq) +{ + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + +repeat: + record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); + if (!record) + goto oom; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + spin_lock(&journal->j_revoke_lock); + list_add(&record->hash, hash_list); + spin_unlock(&journal->j_revoke_lock); + return 0; + +oom: + if (!journal_oom_retry) + return -ENOMEM; + jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__); + yield(); + goto repeat; +} + +/* Find a revoke record in the journal's hash table. */ + +static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned long blocknr) +{ + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + spin_lock(&journal->j_revoke_lock); + record = (struct jbd_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) { + spin_unlock(&journal->j_revoke_lock); + return record; + } + record = (struct jbd_revoke_record_s *) record->hash.next; + } + spin_unlock(&journal->j_revoke_lock); + return NULL; +} + +int __init journal_init_revoke_caches(void) +{ + revoke_record_cache = kmem_cache_create("revoke_record", + sizeof(struct jbd_revoke_record_s), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (revoke_record_cache == 0) + return -ENOMEM; + + revoke_table_cache = kmem_cache_create("revoke_table", + sizeof(struct jbd_revoke_table_s), + 0, 0, NULL, NULL); + if (revoke_table_cache == 0) { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + return -ENOMEM; + } + return 0; +} + +void journal_destroy_revoke_caches(void) +{ + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = NULL; +} + +/* Initialise the revoke table for a given journal to a given size. */ + +int journal_init_revoke(journal_t *journal, int hash_size) +{ + int shift, tmp; + + J_ASSERT (journal->j_revoke_table[0] == NULL); + + shift = 0; + tmp = hash_size; + while((tmp >>= 1UL) != 0UL) + shift++; + + journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!journal->j_revoke_table[0]) + return -ENOMEM; + journal->j_revoke = journal->j_revoke_table[0]; + + /* Check that the hash_size is a power of two */ + J_ASSERT ((hash_size & (hash_size-1)) == 0); + + journal->j_revoke->hash_size = hash_size; + + journal->j_revoke->hash_shift = shift; + + journal->j_revoke->hash_table = + kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); + if (!journal->j_revoke->hash_table) { + kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); + journal->j_revoke = NULL; + return -ENOMEM; + } + + for (tmp = 0; tmp < hash_size; tmp++) + INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + + journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!journal->j_revoke_table[1]) { + kfree(journal->j_revoke_table[0]->hash_table); + kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); + return -ENOMEM; + } + + journal->j_revoke = journal->j_revoke_table[1]; + + /* Check that the hash_size is a power of two */ + J_ASSERT ((hash_size & (hash_size-1)) == 0); + + journal->j_revoke->hash_size = hash_size; + + journal->j_revoke->hash_shi