combined patch to support 48 bit ext3dev filesystem based on extents --- linux-2.6.17-git13-ming/fs/ext3/Makefile | 2 linux-2.6.17-git13-ming/fs/ext3/balloc.c | 74 linux-2.6.17-git13-ming/fs/ext3/dir.c | 3 linux-2.6.17-git13-ming/fs/ext3/extents.c | 2119 ++++++++++++++++ linux-2.6.17-git13-ming/fs/ext3/ialloc.c | 30 linux-2.6.17-git13-ming/fs/ext3/inode.c | 34 linux-2.6.17-git13-ming/fs/ext3/ioctl.c | 1 linux-2.6.17-git13-ming/fs/ext3/resize.c | 39 linux-2.6.17-git13-ming/fs/ext3/super.c | 68 linux-2.6.17-git13-ming/fs/jbd/commit.c | 19 linux-2.6.17-git13-ming/fs/jbd/journal.c | 27 linux-2.6.17-git13-ming/fs/jbd/recovery.c | 51 linux-2.6.17-git13-ming/fs/jbd/revoke.c | 38 linux-2.6.17-git13-ming/include/asm-h8300/types.h | 1 linux-2.6.17-git13-ming/include/asm-i386/types.h | 1 linux-2.6.17-git13-ming/include/asm-mips/types.h | 5 linux-2.6.17-git13-ming/include/asm-powerpc/types.h | 5 linux-2.6.17-git13-ming/include/asm-s390/types.h | 5 linux-2.6.17-git13-ming/include/asm-sh/types.h | 1 linux-2.6.17-git13-ming/include/asm-x86_64/types.h | 1 linux-2.6.17-git13-ming/include/linux/ext3_fs.h | 153 + linux-2.6.17-git13-ming/include/linux/ext3_fs_extents.h | 198 + linux-2.6.17-git13-ming/include/linux/ext3_fs_i.h | 17 linux-2.6.17-git13-ming/include/linux/ext3_fs_sb.h | 10 linux-2.6.17-git13-ming/include/linux/ext3_jbd.h | 19 linux-2.6.17-git13-ming/include/linux/jbd.h | 31 linux-2.6.17-git13-ming/include/linux/types.h | 1 27 files changed, 2783 insertions(+), 170 deletions(-) diff -puN fs/ext3/balloc.c~ext3dev-2.6.17-git13 fs/ext3/balloc.c --- linux-2.6.17-git13/fs/ext3/balloc.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.925868419 -0700 +++ linux-2.6.17-git13-ming/fs/ext3/balloc.c 2006-06-28 17:45:21.064852472 -0700 @@ -38,7 +38,6 @@ #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh) @@ -89,12 +88,16 @@ 
read_block_bitmap(struct super_block *sb desc = ext3_get_group_desc (sb, block_group, NULL); if (!desc) goto error_out; - bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap)); + bh = sb_bread(sb, + EXT3_BLOCK_BITMAP(desc, + ext3_group_first_block_no(sb, block_group))); if (!bh) ext3_error (sb, "read_block_bitmap", "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %u", - block_group, le32_to_cpu(desc->bg_block_bitmap)); + "block_group = %d, block_bitmap = "E3FSBLK, + block_group, + EXT3_BLOCK_BITMAP(desc, + ext3_group_first_block_no(sb, block_group))); error_out: return bh; } @@ -329,7 +332,7 @@ void ext3_free_blocks_sb(handle_t *handl es = sbi->s_es; if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || - block + count > le32_to_cpu(es->s_blocks_count)) { + block + count > EXT3_BLOCKS_COUNT(es)) { ext3_error (sb, "ext3_free_blocks", "Freeing blocks not in datazone - " "block = "E3FSBLK", count = %lu", block, count); @@ -340,10 +343,7 @@ void ext3_free_blocks_sb(handle_t *handl do_more: overflow = 0; - block_group = (block - le32_to_cpu(es->s_first_data_block)) / - EXT3_BLOCKS_PER_GROUP(sb); - bit = (block - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb); + ext3_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group * boundary. 
@@ -360,11 +360,19 @@ do_more: if (!desc) goto error_return; - if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || - in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || - in_range (block, le32_to_cpu(desc->bg_inode_table), + if (in_range (EXT3_BLOCK_BITMAP(desc, + ext3_group_first_block_no(sb, block_group)), + block, count) || + in_range (EXT3_INODE_BITMAP(desc, + ext3_group_first_block_no(sb, block_group)), + block, count) || + in_range (block, + EXT3_INODE_TABLE(desc, + ext3_group_first_block_no(sb, block_group)), sbi->s_itb_per_group) || - in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + in_range (block + count - 1, + EXT3_INODE_TABLE(desc, + ext3_group_first_block_no(sb, block_group)), sbi->s_itb_per_group)) ext3_error (sb, "ext3_free_blocks", "Freeing blocks in system zones - " @@ -1167,7 +1175,7 @@ static int ext3_has_free_blocks(struct e ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + root_blocks = EXT3_R_BLOCKS_COUNT(sbi->s_es); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && sbi->s_resuid != current->fsuid && (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { @@ -1205,7 +1213,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gdp_bh; - int group_no; + unsigned long group_no; int goal_group; ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */ ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ @@ -1266,10 +1274,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h * First, test whether the goal block is free. 
*/ if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= le32_to_cpu(es->s_blocks_count)) + goal >= EXT3_BLOCKS_COUNT(es)) goal = le32_to_cpu(es->s_first_data_block); - group_no = (goal - le32_to_cpu(es->s_first_data_block)) / - EXT3_BLOCKS_PER_GROUP(sb); + ext3_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk); gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); if (!gdp) goto io_error; @@ -1286,8 +1293,6 @@ retry: my_rsv = NULL; if (free_blocks > 0) { - grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb)); bitmap_bh = read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; @@ -1368,11 +1373,15 @@ allocated: ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no); - if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || - in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || - in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), + if (in_range(EXT3_BLOCK_BITMAP(gdp, ext3_group_first_block_no(sb, group_no)), + ret_block, num) || + in_range(EXT3_INODE_BITMAP(gdp, ext3_group_first_block_no(sb, group_no)), + ret_block, num) || + in_range(ret_block, EXT3_INODE_TABLE(gdp, + ext3_group_first_block_no(sb, group_no)), EXT3_SB(sb)->s_itb_per_group) || - in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + in_range(ret_block + num - 1, EXT3_INODE_TABLE(gdp, + ext3_group_first_block_no(sb, group_no)), EXT3_SB(sb)->s_itb_per_group)) ext3_error(sb, "ext3_new_block", "Allocating block in system zone - " @@ -1411,11 +1420,11 @@ allocated: jbd_unlock_bh_state(bitmap_bh); #endif - if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + if (ret_block + num - 1 >= EXT3_BLOCKS_COUNT(es)) { ext3_error(sb, "ext3_new_block", - "block("E3FSBLK") >= blocks count(%d) - " - "block_group = %d, es == %p ", ret_block, - le32_to_cpu(es->s_blocks_count), group_no, es); + "block("E3FSBLK") >= blocks count("E3FSBLK") - " + "block_group = %lu, es == %p ", ret_block, +
EXT3_BLOCKS_COUNT(es), group_no, es); goto out; } @@ -1508,7 +1517,7 @@ ext3_fsblk_t ext3_count_free_blocks(stru brelse(bitmap_bh); printk("ext3_count_free_blocks: stored = "E3FSBLK ", computed = "E3FSBLK", "E3FSBLK"\n", - le32_to_cpu(es->s_free_blocks_count), + EXT3_FREE_BLOCKS_COUNT(es), desc_count, bitmap_count); return bitmap_count; #else @@ -1528,9 +1537,10 @@ ext3_fsblk_t ext3_count_free_blocks(stru static inline int block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map) { - return ext3_test_bit ((block - - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb), map); + ext3_grpblk_t offset; + + ext3_get_group_no_and_offset(sb, block, NULL, &offset); + return ext3_test_bit (offset, map); } static inline int test_root(int a, int b) diff -puN fs/ext3/dir.c~ext3dev-2.6.17-git13 fs/ext3/dir.c --- linux-2.6.17-git13/fs/ext3/dir.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.928868075 -0700 +++ linux-2.6.17-git13-ming/fs/ext3/dir.c 2006-06-28 17:45:21.065852357 -0700 @@ -131,8 +131,7 @@ static int ext3_readdir(struct file * fi struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); if (err > 0) { page_cache_readahead(sb->s_bdev->bd_inode->i_mapping, &filp->f_ra, diff -puN /dev/null fs/ext3/extents.c --- /dev/null 2006-06-28 00:02:13.345547960 -0700 +++ linux-2.6.17-git13-ming/fs/ext3/extents.c 2006-06-28 17:45:21.073851439 -0700 @@ -0,0 +1,2119 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * Architecture independence: + * Copyright (c) 2005, Bull S.A. + * Written by Pierre Peiffer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Extents support for EXT3 + * + * TODO: + * - ext3*_error() should be used in some situations + * - analyze all BUG()/BUG_ON(), use -EIO where appropriate + * - smart tree reduction + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/ext3_jbd.h> +#include <linux/jbd.h> +#include <linux/smp_lock.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/ext3_fs_extents.h> +#include <asm/uaccess.h> + + +/* this routine combines low and hi parts of phys. blocknr into ext3_fsblk_t */ +static inline ext3_fsblk_t ext_pblock(struct ext3_extent *ex) +{ + ext3_fsblk_t block; + + block = le32_to_cpu(ex->ee_start); + if (sizeof(ext3_fsblk_t) > 4) + block |= ((ext3_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* this routine combines low and hi parts of phys. blocknr into ext3_fsblk_t */ +static inline ext3_fsblk_t idx_pblock(struct ext3_extent_idx *ix) +{ + ext3_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf); + if (sizeof(ext3_fsblk_t) > 4) + block |= ((ext3_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* the routine stores large phys. blocknr into extent breaking it into parts */ +static inline void ext3_ext_store_pblock(struct ext3_extent *ex, ext3_fsblk_t pb) +{ + ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + if (sizeof(ext3_fsblk_t) > 4) + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); +} + +/* the routine stores large phys. 
blocknr into index breaking it into parts */ +static inline void ext3_idx_store_pblock(struct ext3_extent_idx *ix, ext3_fsblk_t pb) +{ + ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + if (sizeof(ext3_fsblk_t) > 4) + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); +} + +static int ext3_ext_check_header(const char *function, struct inode *inode, + struct ext3_extent_header *eh) +{ + const char *error_msg = NULL; + + if (unlikely(eh->eh_magic != EXT3_EXT_MAGIC)) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + return 0; + +corrupted: + ext3_error(inode->i_sb, function, + "bad header in inode #%lu: %s - magic %x, " + "entries %u, max %u, depth %u", + inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + le16_to_cpu(eh->eh_depth)); + + return -EIO; +} + +static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) +{ + int err; + + if (handle->h_buffer_credits > needed) + return handle; + if (!ext3_journal_extend(handle, needed)) + return handle; + err = ext3_journal_restart(handle, needed); + + return handle; +} + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext3_ext_get_access(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + return ext3_journal_get_write_access(handle, path->p_bh); + } + /* path points to leaf/index in inode body */ + /* we use in-core data, no need to protect them */ + return 0; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - EIO + */ +static int ext3_ext_dirty(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + int err; + if (path->p_bh) { + /* path points to block */ + err = 
ext3_journal_dirty_metadata(handle, path->p_bh); + } else { + /* path points to leaf/index in inode body */ + err = ext3_mark_inode_dirty(handle, inode); + } + return err; +} + +static ext3_fsblk_t ext3_ext_find_goal(struct inode *inode, + struct ext3_ext_path *path, + ext3_fsblk_t block) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + ext3_fsblk_t bg_start; + ext3_grpblk_t colour; + int depth; + + if (path) { + struct ext3_extent *ex; + depth = path->p_depth; + + /* try to predict block placement */ + if ((ex = path[depth].p_ext)) + return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); + + /* it looks index is empty + * try to find starting from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. use inode's group */ + bg_start = ((ext3_fsblk_t) ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour + block; +} + +static ext3_fsblk_t +ext3_ext_new_block(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *ex, int *err) +{ + ext3_fsblk_t goal, newblock; + + goal = ext3_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + newblock = ext3_new_block(handle, inode, goal, err); + return newblock; +} + +static inline int ext3_ext_space_block(struct inode *inode) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + if (size > 6) + size = 6; +#endif + return size; +} + +static inline int ext3_ext_space_block_idx(struct inode *inode) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + if (size > 5) + size = 5; +#endif + return size; +} + +static inline int ext3_ext_space_root(struct inode *inode) +{ + int size; + + size =
sizeof(EXT3_I(inode)->i_data); + size -= sizeof(struct ext3_extent_header); + size /= sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + if (size > 3) + size = 3; +#endif + return size; +} + +static inline int ext3_ext_space_root_idx(struct inode *inode) +{ + int size; + + size = sizeof(EXT3_I(inode)->i_data); + size -= sizeof(struct ext3_extent_header); + size /= sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + if (size > 4) + size = 4; +#endif + return size; +} + +#ifdef EXT_DEBUG +static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug("path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(" %d->"E3FSBLK, le32_to_cpu(path->p_idx->ei_block), + idx_pblock(path->p_idx)); + } else if (path->p_ext) { + ext_debug(" %d:%d:"E3FSBLK" ", + le32_to_cpu(path->p_ext->ee_block), + le16_to_cpu(path->p_ext->ee_len), + ext_pblock(path->p_ext)); + } else + ext_debug(" []"); + } + ext_debug("\n"); +} + +static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path) +{ + int depth = ext_depth(inode); + struct ext3_extent_header *eh; + struct ext3_extent *ex; + int i; + + if (!path) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { + ext_debug("%d:%d:"E3FSBLK" ", le32_to_cpu(ex->ee_block), + le16_to_cpu(ex->ee_len), ext_pblock(ex)); + } + ext_debug("\n"); +} +#else +#define ext3_ext_show_path(inode,path) +#define ext3_ext_show_leaf(inode,path) +#endif + +static void ext3_ext_drop_refs(struct ext3_ext_path *path) +{ + int depth = path->p_depth; + int i; + + for (i = 0; i <= depth; i++, path++) + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +/* + * binary search for closest index by given block + */ +static void +ext3_ext_binsearch_idx(struct inode *inode, struct ext3_ext_path *path, int block) +{ + struct ext3_extent_header *eh = path->p_hdr; + struct 
ext3_extent_idx *r, *l, *m; + + BUG_ON(eh->eh_magic != EXT3_EXT_MAGIC); + BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max)); + BUG_ON(le16_to_cpu(eh->eh_entries) <= 0); + + ext_debug("binsearch for %d(idx): ", block); + + l = EXT_FIRST_INDEX(eh) + 1; + r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1; + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ei_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block, + m, m->ei_block, r, r->ei_block); + } + + path->p_idx = l - 1; + ext_debug(" -> %d->"E3FSBLK" ", le32_to_cpu(path->p_idx->ei_block), + idx_pblock(path->p_idx)); + +#ifdef CHECK_BINSEARCH + { + struct ext3_extent_idx *chix, *ix; + int k; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { + if (k != 0 && + le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + printk("k=%d, ix=0x%p, first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk("%u <= %u\n", + le32_to_cpu(ix->ei_block), + le32_to_cpu(ix[-1].ei_block)); + } + BUG_ON(k && le32_to_cpu(ix->ei_block) + <= le32_to_cpu(ix[-1].ei_block)); + if (block < le32_to_cpu(ix->ei_block)) + break; + chix = ix; + } + BUG_ON(chix != path->p_idx); + } +#endif + +} + +/* + * binary search for closest extent by given block + */ +static void +ext3_ext_binsearch(struct inode *inode, struct ext3_ext_path *path, int block) +{ + struct ext3_extent_header *eh = path->p_hdr; + struct ext3_extent *r, *l, *m; + + BUG_ON(eh->eh_magic != EXT3_EXT_MAGIC); + BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max)); + + if (eh->eh_entries == 0) { + /* + * this leaf is empty yet: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug("binsearch for %d: ", block); + + l = EXT_FIRST_EXTENT(eh) + 1; + r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1; + + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ee_block)) + r = m - 1; + else + l = m + 1; +
ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block, + m, m->ee_block, r, r->ee_block); + } + + path->p_ext = l - 1; + ext_debug(" -> %d:"E3FSBLK":%d ", + le32_to_cpu(path->p_ext->ee_block), + ext_pblock(path->p_ext), + le16_to_cpu(path->p_ext->ee_len)); + +#ifdef CHECK_BINSEARCH + { + struct ext3_extent *chex, *ex; + int k; + + chex = ex = EXT_FIRST_EXTENT(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { + BUG_ON(k && le32_to_cpu(ex->ee_block) + <= le32_to_cpu(ex[-1].ee_block)); + if (block < le32_to_cpu(ex->ee_block)) + break; + chex = ex; + } + BUG_ON(chex != path->p_ext); + } +#endif + +} + +int ext3_ext_tree_init(handle_t *handle, struct inode *inode) +{ + struct ext3_extent_header *eh; + + eh = ext_inode_hdr(inode); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = EXT3_EXT_MAGIC; + eh->eh_max = cpu_to_le16(ext3_ext_space_root(inode)); + ext3_mark_inode_dirty(handle, inode); + ext3_ext_invalidate_cache(inode); + return 0; +} + +struct ext3_ext_path * +ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path) +{ + struct ext3_extent_header *eh; + struct buffer_head *bh; + short int depth, i, ppos = 0, alloc = 0; + + eh = ext_inode_hdr(inode); + BUG_ON(eh == NULL); + if (ext3_ext_check_header(__FUNCTION__, inode, eh)) + return ERR_PTR(-EIO); + + i = depth = ext_depth(inode); + + /* account possible depth increase */ + if (!path) { + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), + GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + alloc = 1; + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); + path[0].p_hdr = eh; + + /* walk through the tree */ + while (i) { + ext_debug("depth %d: num %d, max %d\n", + ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + ext3_ext_binsearch_idx(inode, path + ppos, block); + path[ppos].p_block = idx_pblock(path[ppos].p_idx); + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = sb_bread(inode->i_sb, path[ppos].p_block); + if (!bh) + goto 
err; + + eh = ext_block_hdr(bh); + ppos++; + BUG_ON(ppos > depth); + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; + + if (ext3_ext_check_header(__FUNCTION__, inode, eh)) + goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + if (ext3_ext_check_header(__FUNCTION__, inode, eh)) + goto err; + + /* find extent */ + ext3_ext_binsearch(inode, path + ppos, block); + + ext3_ext_show_path(inode, path); + + return path; + +err: + ext3_ext_drop_refs(path); + if (alloc) + kfree(path); + return ERR_PTR(-EIO); +} + +/* + * insert new index [logical;ptr] into the block at curp + * it checks where to insert: before curp or after curp + */ +static int ext3_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext3_ext_path *curp, + int logical, ext3_fsblk_t ptr) +{ + struct ext3_extent_idx *ix; + int len, err; + + if ((err = ext3_ext_get_access(handle, inode, curp))) + return err; + + BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { + /* insert after */ + if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { + len = (len - 1) * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug("insert new index %d after: "E3FSBLK". " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + (curp->p_idx + 1), (curp->p_idx + 2)); + memmove(curp->p_idx + 2, curp->p_idx + 1, len); + } + ix = curp->p_idx + 1; + } else { + /* insert before */ + len = len * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug("insert new index %d before: "E3FSBLK". 
" + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + curp->p_idx, (curp->p_idx + 1)); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } + + ix->ei_block = cpu_to_le32(logical); + ext3_idx_store_pblock(ix, ptr); + curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1); + + BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) + > le16_to_cpu(curp->p_hdr->eh_max)); + BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); + + err = ext3_ext_dirty(handle, inode, curp); + ext3_std_error(inode->i_sb, err); + + return err; +} + +/* + * routine inserts new subtree into the path, using free index entry + * at depth 'at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extens and index entries (right to the split point) + * into the newly allocated blocks + * - initialize subtree + */ +static int ext3_ext_split(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = ext_depth(inode); + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + struct ext3_extent *ex; + int i = at, k, m, a; + ext3_fsblk_t newblock, oldblock; + __le32 border; + ext3_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where to split? */ + /* FIXME: now desicion is simplest: at current extent */ + + /* if current leaf will be splitted, then we should use + * border from split point */ + BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug("leaf will be splitted." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } else { + border = newext->ee_block; + ext_debug("leaf will be added." 
+ " next leaf starts at %d\n", + le32_to_cpu(border)); + } + + /* + * if error occurs, then we break processing + * and turn filesystem read-only. so, index won't + * be inserted and tree will be in consistent + * state. next mount will repair buffers too + */ + + /* + * get array to track all allocated blocks + * we need this to handle errors and free blocks + * upon them + */ + ablocks = kmalloc(sizeof(ext3_fsblk_t) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + memset(ablocks, 0, sizeof(ext3_fsblk_t) * depth); + + /* allocate all needed blocks */ + ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext3_ext_new_block(handle, inode, path, newext, &err); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + BUG_ON(newblock == 0); + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = 0; + neh->eh_max = cpu_to_le16(ext3_ext_space_block(inode)); + neh->eh_magic = EXT3_EXT_MAGIC; + neh->eh_depth = 0; + ex = EXT_FIRST_EXTENT(neh); + + /* move remain of path[depth] to the new leaf */ + BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); + /* start copy from next extent */ + /* TODO: we could do it by single memmove */ + m = 0; + path[depth].p_ext++; + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:"E3FSBLK":%d in new leaf "E3FSBLK"\n", + le32_to_cpu(path[depth].p_ext->ee_block), + ext_pblock(path[depth].p_ext), + le16_to_cpu(path[depth].p_ext->ee_len), + newblock); + /*memmove(ex++, path[depth].p_ext++, + sizeof(struct ext3_extent)); + neh->eh_entries++;*/ + path[depth].p_ext++; + m++; + } + if (m) { + memmove(ex, path[depth].p_ext-m, sizeof(struct ext3_extent)*m); + neh->eh_entries = 
cpu_to_le16(le16_to_cpu(neh->eh_entries)+m); + } + + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + if ((err = ext3_ext_get_access(handle, inode, path + depth))) + goto cleanup; + path[depth].p_hdr->eh_entries = + cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m); + if ((err = ext3_ext_dirty(handle, inode, path + depth))) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + BUG_ON(k < 0); + if (k) + ext_debug("create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, (ext3_fsblk_t)newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = cpu_to_le16(1); + neh->eh_magic = EXT3_EXT_MAGIC; + neh->eh_max = cpu_to_le16(ext3_ext_space_block_idx(inode)); + neh->eh_depth = cpu_to_le16(depth - i); + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + ext3_idx_store_pblock(fidx, oldblock); + + ext_debug("int.index at %d (block "E3FSBLK"): %lu -> "E3FSBLK"\n", i, + newblock, (unsigned long) le32_to_cpu(border), + oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; + + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != + EXT_LAST_INDEX(path[i].p_hdr)); + while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { + ext_debug("%d: move %d:"E3FSBLK" in new index "E3FSBLK"\n", i, + le32_to_cpu(path[i].p_idx->ei_block), + idx_pblock(path[i].p_idx), + newblock); + /*memmove(++fidx, path[i].p_idx++, + sizeof(struct ext3_extent_idx)); + neh->eh_entries++; + BUG_ON(neh->eh_entries > neh->eh_max);*/ +
path[i].p_idx++; + m++; + } + if (m) { + memmove(++fidx, path[i].p_idx - m, + sizeof(struct ext3_extent_idx) * m); + neh->eh_entries = + cpu_to_le16(le16_to_cpu(neh->eh_entries) + m); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext3_ext_get_access(handle, inode, path + i); + if (err) + goto cleanup; + path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m); + err = ext3_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + if (err) + goto cleanup; + + err = ext3_ext_insert_index(handle, inode, path + at, + le32_to_cpu(border), newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext3_free_blocks(handle, inode, ablocks[i], 1); + } + } + kfree(ablocks); + + return err; +} + +/* + * routine implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initialize new top-level, creating index that points to the + * just created block + */ +static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct ext3_ext_path *curp = path; + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + struct buffer_head *bh; + ext3_fsblk_t newblock; + int err = 0; + + newblock = ext3_ext_new_block(handle, inode, path, newext, &err); + if (newblock == 0) + return err; + + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + ext3_std_error(inode->i_sb, err); + return err; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) { + unlock_buffer(bh); + goto out; + } + + /* move 
top-level index/leaf into new block */ + memmove(bh->b_data, curp->p_hdr, sizeof(EXT3_I(inode)->i_data)); + + /* set size of new block */ + neh = ext_block_hdr(bh); + /* old root could have indexes or leaves + * so calculate e_max right way */ + if (ext_depth(inode)) + neh->eh_max = cpu_to_le16(ext3_ext_space_block_idx(inode)); + else + neh->eh_max = cpu_to_le16(ext3_ext_space_block(inode)); + neh->eh_magic = EXT3_EXT_MAGIC; + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto out; + + /* create index in new top-level index: num,max,pointer */ + if ((err = ext3_ext_get_access(handle, inode, curp))) + goto out; + + curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; + curp->p_hdr->eh_max = cpu_to_le16(ext3_ext_space_root_idx(inode)); + curp->p_hdr->eh_entries = cpu_to_le16(1); + curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); + /* FIXME: it works, but actually path[0] can be index */ + curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; + ext3_idx_store_pblock(curp->p_idx, newblock); + + neh = ext_inode_hdr(inode); + fidx = EXT_FIRST_INDEX(neh); + ext_debug("new root: num %d(%d), lblock %d, ptr "E3FSBLK"\n", + le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), + le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); + + neh->eh_depth = cpu_to_le16(path->p_depth + 1); + err = ext3_ext_dirty(handle, inode, curp); +out: + brelse(bh); + + return err; +} + +/* + * routine finds empty index and adds new leaf. 
if no free index found + * then it requests in-depth growing + */ +static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct ext3_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = ext_depth(inode); + + /* walk up the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block + * so, subsequent data blocks should be contiguous */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext3_ext_split(handle, inode, path, newext, i); + + /* refill path */ + ext3_ext_drop_refs(path); + path = ext3_ext_find_extent(inode, + le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext3_ext_grow_indepth(handle, inode, path, newext); + if (err) + goto out; + + /* refill path */ + ext3_ext_drop_refs(path); + path = ext3_ext_find_extent(inode, + le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + + /* + * only first (depth 0 -> 1) produces free space + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need split */ + goto repeat; + } + } + +out: + return err; +} + +/* + * returns allocated block in subsequent extent or EXT_MAX_BLOCK + * NOTE: it considers block number from index entry as + * allocated block. 
thus, index entries have to be consistent
+ * with leaves
+ *
+ * The leaf level (path[path->p_depth]) is examined first; if the current
+ * extent is the last one of its leaf, the index levels are scanned
+ * towards path[0] for an entry that has a right-hand neighbour.
+ */
+static unsigned long
+ext3_ext_next_allocated_block(struct ext3_ext_path *path)
+{
+	int leaf, level;
+
+	BUG_ON(path == NULL);
+	leaf = path->p_depth;
+
+	/* depth-0 tree without a current extent: nothing follows */
+	if (leaf == 0 && path->p_ext == NULL)
+		return EXT_MAX_BLOCK;
+
+	/* leaf: is there an extent right after the current one? */
+	if (path[leaf].p_ext != EXT_LAST_EXTENT(path[leaf].p_hdr))
+		return le32_to_cpu(path[leaf].p_ext[1].ee_block);
+
+	/* index levels: first one whose current entry is not the last */
+	for (level = leaf - 1; level >= 0; level--) {
+		if (path[level].p_idx != EXT_LAST_INDEX(path[level].p_hdr))
+			return le32_to_cpu(path[level].p_idx[1].ei_block);
+	}
+
+	return EXT_MAX_BLOCK;
+}
+
+/*
+ * returns first allocated block from next leaf or EXT_MAX_BLOCK
+ *
+ * Only index levels are consulted; @inode is not used by the body.
+ */
+static unsigned ext3_ext_next_leaf_block(struct inode *inode,
+					struct ext3_ext_path *path)
+{
+	int level;
+
+	BUG_ON(path == NULL);
+
+	/* zero-tree has no leaf blocks at all */
+	if (path->p_depth == 0)
+		return EXT_MAX_BLOCK;
+
+	/* start at the index level right above the leaf and go up */
+	for (level = path->p_depth - 1; level >= 0; level--) {
+		if (path[level].p_idx != EXT_LAST_INDEX(path[level].p_hdr))
+			return le32_to_cpu(path[level].p_idx[1].ei_block);
+	}
+
+	return EXT_MAX_BLOCK;
+}
+
+/*
+ * if leaf gets modified and modified extent is first in the leaf
+ * then we have to correct all indexes above
+ * TODO: do we need to correct tree in all cases?
+ *
+ * Returns 0 when no correction is needed or all updates succeeded,
+ * otherwise the error from ext3_ext_get_access()/ext3_ext_dirty().
+ */
+int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	int depth = ext_depth(inode);
+	struct ext3_extent_header *leaf_hdr = path[depth].p_hdr;
+	struct ext3_extent *cur = path[depth].p_ext;
+	__le32 border;
+	int level, err = 0;
+
+	BUG_ON(cur == NULL);
+	BUG_ON(leaf_hdr == NULL);
+
+	/*
+	 * nothing to do when there is no tree at all, or when the
+	 * modified extent is not the first one in its leaf
+	 */
+	if (depth == 0 || cur != EXT_FIRST_EXTENT(leaf_hdr))
+		return 0;
+
+	/*
+	 * TODO: we need correction if border is smaller than current one
+	 */
+	border = cur->ee_block;
+
+	/*
+	 * The index right above the leaf is always rewritten; the levels
+	 * above it only as long as the entry below is first in its block.
+	 */
+	for (level = depth - 1; level >= 0; level--) {
+		if (level != depth - 1 &&
+		    path[level + 1].p_idx !=
+				EXT_FIRST_INDEX(path[level + 1].p_hdr))
+			break;
+		err = ext3_ext_get_access(handle, inode, path + level);
+		if (err)
+			break;
+		path[level].p_idx->ei_block = border;
+		err = ext3_ext_dirty(handle, inode, path + level);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * returns 1 if ex2 directly follows ex1 both logically and physically
+ * and the combined length does not exceed EXT_MAX_LEN, 0 otherwise
+ * (@inode is not used by the body)
+ */
+static int inline
+ext3_can_extents_be_merged(struct inode *inode, struct ext3_extent *ex1,
+				struct ext3_extent *ex2)
+{
+	unsigned short len1 = le16_to_cpu(ex1->ee_len);
+	unsigned short len2 = le16_to_cpu(ex2->ee_len);
+
+	/* logical ranges must be adjacent */
+	if (le32_to_cpu(ex1->ee_block) + len1 != le32_to_cpu(ex2->ee_block))
+		return 0;
+
+	/*
+	 * To allow future support for preallocated extents to be added
+	 * as an RO_COMPAT feature, refuse to merge two extents if
+	 * it can result in the top bit of ee_len being set
+	 */
+	if (len1 + len2 > EXT_MAX_LEN)
+		return 0;
+#ifdef AGRESSIVE_TEST
+	if (len1 >= 4)
+		return 0;
+#endif
+
+	/* physical ranges must be adjacent as well */
+	return ext_pblock(ex1) + len1 == ext_pblock(ex2);
+}
+
+/*
+ * this routine tries to merge requested extent into the existing
+ * extent or inserts requested extent as new one into the tree,
+ * creating new leaf in no-space case
+ 
*
+ * Returns 0 on success, otherwise the error reported by the helper that
+ * failed.  Except for the two early returns (journal access failure on
+ * the append path, lookup failure for the next leaf) every exit goes
+ * through "cleanup", which releases npath and invalidates the cache.
+ */
+int ext3_ext_insert_extent(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path,
+				struct ext3_extent *newext)
+{
+	struct ext3_extent_header * eh;
+	struct ext3_extent *ex, *fex;
+	struct ext3_extent *nearex; /* nearest extent */
+	struct ext3_ext_path *npath = NULL;
+	int depth, len, err, next;
+
+	BUG_ON(newext->ee_len == 0);
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	BUG_ON(path[depth].p_hdr == NULL);
+
+	/* try to insert block into found extent and return */
+	if (ex && ext3_can_extents_be_merged(inode, ex, newext)) {
+		ext_debug("append %d block to %d:%d (from "E3FSBLK")\n",
+				le16_to_cpu(newext->ee_len),
+				le32_to_cpu(ex->ee_block),
+				le16_to_cpu(ex->ee_len), ext_pblock(ex));
+		if ((err = ext3_ext_get_access(handle, inode, path + depth)))
+			return err;
+		ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
+					 + le16_to_cpu(newext->ee_len));
+		eh = path[depth].p_hdr;
+		nearex = ex;
+		goto merge;
+	}
+
+repeat:
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
+		goto has_space;
+
+	/* probably next leaf has space for us? */
+	fex = EXT_LAST_EXTENT(eh);
+	next = ext3_ext_next_leaf_block(inode, path);
+	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
+	    && next != EXT_MAX_BLOCK) {
+		ext_debug("next leaf block - %d\n", next);
+		BUG_ON(npath != NULL);
+		npath = ext3_ext_find_extent(inode, next, NULL);
+		if (IS_ERR(npath))
+			return PTR_ERR(npath);
+		BUG_ON(npath->p_depth != path->p_depth);
+		eh = npath[depth].p_hdr;
+		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
+			ext_debug("next leaf isnt full(%d)\n",
+				  le16_to_cpu(eh->eh_entries));
+			/* continue with the neighbouring leaf's path */
+			path = npath;
+			goto repeat;
+		}
+		ext_debug("next leaf has no free space(%d,%d)\n",
+			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+	}
+
+	/*
+	 * there is no free space in found leaf
+	 * we're gonna add new leaf in the tree
+	 */
+	err = ext3_ext_create_new_leaf(handle, inode, path, newext);
+	if (err)
+		goto cleanup;
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+
+has_space:
+	nearex = path[depth].p_ext;
+
+	if ((err = ext3_ext_get_access(handle, inode, path + depth)))
+		goto cleanup;
+
+	if (!nearex) {
+		/* there is no extent in this leaf, create first one */
+		ext_debug("first extent in the leaf: %d:"E3FSBLK":%d\n",
+				le32_to_cpu(newext->ee_block),
+				ext_pblock(newext),
+				le16_to_cpu(newext->ee_len));
+		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
+	} else if (le32_to_cpu(newext->ee_block)
+			> le32_to_cpu(nearex->ee_block)) {
+/*		BUG_ON(newext->ee_block == nearex->ee_block); */
+		if (nearex != EXT_LAST_EXTENT(eh)) {
+			/* open a slot right after nearex */
+			len = EXT_MAX_EXTENT(eh) - nearex;
+			len = (len - 1) * sizeof(struct ext3_extent);
+			len = len < 0 ? 0 : len;
+			ext_debug("insert %d:"E3FSBLK":%d after: nearest 0x%p, "
+					"move %d from 0x%p to 0x%p\n",
+					le32_to_cpu(newext->ee_block),
+					ext_pblock(newext),
+					le16_to_cpu(newext->ee_len),
+					nearex, len, nearex + 1, nearex + 2);
+			memmove(nearex + 2, nearex + 1, len);
+		}
+		path[depth].p_ext = nearex + 1;
+	} else {
+		/* open a slot at nearex itself, shifting it to the right */
+		BUG_ON(newext->ee_block == nearex->ee_block);
+		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
+		len = len < 0 ? 0 : len;
+		ext_debug("insert %d:"E3FSBLK":%d before: nearest 0x%p, "
+				"move %d from 0x%p to 0x%p\n",
+				le32_to_cpu(newext->ee_block),
+				ext_pblock(newext),
+				le16_to_cpu(newext->ee_len),
+				nearex, len, nearex + 1, nearex + 2);
+		memmove(nearex + 1, nearex, len);
+		path[depth].p_ext = nearex;
+	}
+
+	/* fill the freed slot with the new extent */
+	eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
+	nearex = path[depth].p_ext;
+	nearex->ee_block = newext->ee_block;
+	nearex->ee_start = newext->ee_start;
+	nearex->ee_start_hi = newext->ee_start_hi;
+	nearex->ee_len = newext->ee_len;
+
+merge:
+	/* try to merge extents to the right */
+	while (nearex < EXT_LAST_EXTENT(eh)) {
+		if (!ext3_can_extents_be_merged(inode, nearex, nearex + 1))
+			break;
+		/* merge with next extent! */
+		nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
+					     + le16_to_cpu(nearex[1].ee_len));
+		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
+			len = (EXT_LAST_EXTENT(eh) - nearex - 1)
+					* sizeof(struct ext3_extent);
+			memmove(nearex + 1, nearex + 2, len);
+		}
+		eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
+		BUG_ON(eh->eh_entries == 0);
+	}
+
+	/* try to merge extents to the left */
+
+	/* time to correct all indexes above */
+	err = ext3_ext_correct_indexes(handle, inode, path);
+	if (err)
+		goto cleanup;
+
+	err = ext3_ext_dirty(handle, inode, path + depth);
+
+cleanup:
+	if (npath) {
+		ext3_ext_drop_refs(npath);
+		kfree(npath);
+	}
+	ext3_ext_tree_changed(inode);
+	ext3_ext_invalidate_cache(inode);
+	return err;
+}
+
+/*
+ * walks the logical range [block, block + num) and calls @func once for
+ * every piece of it, described by an ext3_ext_cache record of type
+ * EXT3_EXT_CACHE_EXTENT (covered by an extent) or EXT3_EXT_CACHE_GAP.
+ * callback results: negative - stop and return it, EXT_REPEAT - look up
+ * the same block again, EXT_BREAK - stop and return 0, anything else -
+ * advance past the reported piece.
+ */
+int ext3_ext_walk_space(struct inode *inode, unsigned long block,
+			unsigned long num, ext_prepare_callback func,
+			void *cbdata)
+{
+	struct ext3_ext_path *path = NULL;
+	struct ext3_ext_cache cbex;
+	struct ext3_extent *ex;
+	unsigned long next, start = 0, end = 0;
+	unsigned long last = block + num;
+	int depth, exists, err = 0;
+
+	BUG_ON(func == NULL);
+	BUG_ON(inode == NULL);
+
+	while (block < last && block != EXT_MAX_BLOCK) {
+		num = last - block;
+		/* find extent for this block */
+		path = ext3_ext_find_extent(inode, block, path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			path = NULL;
+			break;
+		}
+
+		depth = ext_depth(inode);
+		BUG_ON(path[depth].p_hdr == NULL);
+		ex = path[depth].p_ext;
+		next = ext3_ext_next_allocated_block(path);
+
+		exists = 0;
+		if (!ex) {
+			/* there is no extent yet, so try to allocate
+			 * all requested space */
+			start = block;
+			end = block + num;
+		} else if (le32_to_cpu(ex->ee_block) > block) {
+			/* need to allocate space before found extent */
+			start = block;
+			end = le32_to_cpu(ex->ee_block);
+			if (block + num < end)
+				end = block + num;
+		} else if (block >=
+			     le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+			/* need to allocate space after found extent */
+			start = block;
+			end = block + num;
+			if (end >= next)
+				end = next;
+		} else if (block >= le32_to_cpu(ex->ee_block)) {
+			/*
+			 * some part of requested space is covered
+			 * by found extent
+			 */
+			start = block;
+			end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+			if (block + num < end)
+				end = block + num;
+			exists = 1;
+		} else {
+			BUG();
+		}
+		BUG_ON(end <= start);
+
+		if (!exists) {
+			cbex.ec_block = start;
+			cbex.ec_len = end - start;
+			cbex.ec_start = 0;
+			cbex.ec_type = EXT3_EXT_CACHE_GAP;
+		} else {
+			/* the whole found extent is reported, not only
+			 * the requested part of it */
+			cbex.ec_block = le32_to_cpu(ex->ee_block);
+			cbex.ec_len = le16_to_cpu(ex->ee_len);
+			cbex.ec_start = ext_pblock(ex);
+			cbex.ec_type = EXT3_EXT_CACHE_EXTENT;
+		}
+
+		BUG_ON(cbex.ec_len == 0);
+		err = func(inode, path, &cbex, cbdata);
+		ext3_ext_drop_refs(path);
+
+		if (err < 0)
+			break;
+		if (err == EXT_REPEAT)
+			continue;
+		else if (err == EXT_BREAK) {
+			err = 0;
+			break;
+		}
+
+		if (ext_depth(inode) != depth) {
+			/* depth was changed. we have to realloc path */
+			kfree(path);
+			path = NULL;
+		}
+
+		block = cbex.ec_block + cbex.ec_len;
+	}
+
+	if (path) {
+		ext3_ext_drop_refs(path);
+		kfree(path);
+	}
+
+	return err;
+}
+
+/*
+ * remembers one extent or gap in the per-inode single-entry cache
+ * @start is a physical block number and therefore has to be passed as
+ * ext3_fsblk_t: a __u32 parameter would silently drop the high bits of
+ * block numbers above 32 bits before they reach ec_start
+ */
+static inline void
+ext3_ext_put_in_cache(struct inode *inode, __u32 block,
+			__u32 len, ext3_fsblk_t start, int type)
+{
+	struct ext3_ext_cache *cex;
+	BUG_ON(len == 0);
+	cex = &EXT3_I(inode)->i_cached_extent;
+	cex->ec_type = type;
+	cex->ec_block = block;
+	cex->ec_len = len;
+	cex->ec_start = start;
+}
+
+/*
+ * this routine calculates boundaries of the gap requested block fits into
+ * and caches this gap
+ */
+static inline void
+ext3_ext_put_gap_in_cache(struct inode *inode, struct ext3_ext_path *path,
+				unsigned long block)
+{
+	int depth = ext_depth(inode);
+	unsigned long lblock, len;
+	struct ext3_extent *ex;
+
+	ex = path[depth].p_ext;
+	if (ex == NULL) {
+		/* there is no extent yet, so gap is [0;-] */
+		lblock = 0;
+		len = EXT_MAX_BLOCK;
+		ext_debug("cache gap(whole file):");
+	} else if (block < le32_to_cpu(ex->ee_block)) {
+		/* gap runs from the requested block up to the extent */
+		lblock = block;
+		len = le32_to_cpu(ex->ee_block) - block;
+		ext_debug("cache gap(before): %lu [%lu:%lu]",
+				(unsigned long) block,
+				(unsigned long) le32_to_cpu(ex->ee_block),
+				(unsigned long) le16_to_cpu(ex->ee_len));
+	} else if (block >= le32_to_cpu(ex->ee_block)
+				+ le16_to_cpu(ex->ee_len)) {
+		/* gap runs from the end of the extent up to the next
+		 * allocated block */
+		lblock = le32_to_cpu(ex->ee_block)
+			 + le16_to_cpu(ex->ee_len);
+		len = ext3_ext_next_allocated_block(path);
+		ext_debug("cache gap(after): [%lu:%lu] %lu",
+				(unsigned long) le32_to_cpu(ex->ee_block),
+				(unsigned long) le16_to_cpu(ex->ee_len),
+				(unsigned long) block);
+		BUG_ON(len == lblock);
+		len = len - lblock;
+	} else {
+		/* block is covered by the extent: not a gap at all */
+		lblock = len = 0;
+		BUG();
+	}
+
+	ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
+	ext3_ext_put_in_cache(inode, lblock, len, 0, EXT3_EXT_CACHE_GAP);
+}
+
+/*
+ * looks @block up in the per-inode cache
+ * returns EXT3_EXT_CACHE_NO on a miss; on a hit fills @ex from the
+ * cached record and returns its type (GAP or EXTENT)
+ */
+static inline int
+ext3_ext_in_cache(struct inode *inode, unsigned long block,
+			struct ext3_extent *ex)
+{
+	struct ext3_ext_cache *cex;
+
+	cex = &EXT3_I(inode)->i_cached_extent;
+
+	/* has cache valid data? */
+	if (cex->ec_type == EXT3_EXT_CACHE_NO)
+		return EXT3_EXT_CACHE_NO;
+
+	BUG_ON(cex->ec_type != EXT3_EXT_CACHE_GAP &&
+			cex->ec_type != EXT3_EXT_CACHE_EXTENT);
+	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+		ex->ee_block = cpu_to_le32(cex->ec_block);
+		ext3_ext_store_pblock(ex, cex->ec_start);
+		ex->ee_len = cpu_to_le16(cex->ec_len);
+		ext_debug("%lu cached by %lu:%lu:"E3FSBLK"\n",
+				(unsigned long) block,
+				(unsigned long) cex->ec_block,
+				(unsigned long) cex->ec_len,
+				cex->ec_start);
+		return cex->ec_type;
+	}
+
+	/* not in cache */
+	return EXT3_EXT_CACHE_NO;
+}
+
+/*
+ * routine removes index from the index block
+ * it's used in truncate case only.
thus all requests are for
+ * last index in the block only
+ *
+ * @path points at the level whose block became empty; the entry count
+ * of the level above it (path - 1) is decremented and the block that
+ * entry pointed to is forgotten and freed.
+ */
+int ext3_ext_rm_idx(handle_t *handle, struct inode *inode,
+			struct ext3_ext_path *path)
+{
+	struct buffer_head *bh;
+	int err;
+	ext3_fsblk_t leaf;
+
+	/* free index block */
+	path--;
+	leaf = idx_pblock(path->p_idx);
+	BUG_ON(path->p_hdr->eh_entries == 0);
+	if ((err = ext3_ext_get_access(handle, inode, path)))
+		return err;
+	path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
+	if ((err = ext3_ext_dirty(handle, inode, path)))
+		return err;
+	ext_debug("index is empty, remove it, free block "E3FSBLK"\n", leaf);
+	bh = sb_find_get_block(inode->i_sb, leaf);
+	ext3_forget(handle, 1, inode, bh, leaf);
+	ext3_free_blocks(handle, inode, leaf, 1);
+	return err;
+}
+
+/*
+ * This routine returns max. credits extent tree can consume.
+ * It should be OK for low-performance paths like ->writepage()
+ * To allow many writing process to fit a single transaction,
+ * caller should calculate credits under truncate_mutex and
+ * pass actual path.
+ *
+ * With a path whose leaf still has a free slot the answer is 1;
+ * otherwise (or with path == NULL) the worst case is summed up.
+ */
+int inline ext3_ext_calc_credits_for_insert(struct inode *inode,
+						struct ext3_ext_path *path)
+{
+	int depth, needed;
+
+	if (path) {
+		/* probably there is space in leaf? */
+		depth = ext_depth(inode);
+		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
+				< le16_to_cpu(path[depth].p_hdr->eh_max))
+			return 1;
+	}
+
+	/*
+	 * given 32bit logical block (4294967296 blocks), max. tree
+	 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
+	 * let's also add one more level for imbalance.
+	 */
+	depth = 5;
+
+	/* allocation of new data block(s) */
+	needed = 2;
+
+	/*
+	 * tree can be full, so it'd need to grow in depth:
+	 * allocation + old root + new root
+	 */
+	needed += 2 + 1 + 1;
+
+	/*
+	 * Index split can happen, we'd need:
+	 *    allocate intermediate indexes (bitmap + group)
+	 *  + change two blocks at each level, but root (already included)
+	 *
+	 * these credits come on top of the ones counted above, so they
+	 * have to be added; assigning here would drop the data block
+	 * and tree growth credits
+	 */
+	needed += (depth * 2) + (depth * 2);
+
+	/* any allocation modifies superblock */
+	needed += 1;
+
+	return needed;
+}
+
+/*
+ * frees the blocks of @ex that back the logical range [from, to]
+ * only removal of the extent's tail (up to and including its last
+ * block) is implemented; any other request is just reported via printk
+ * always returns 0
+ */
+static int ext3_remove_blocks(handle_t *handle, struct inode *inode,
+				struct ext3_extent *ex,
+				unsigned long from, unsigned long to)
+{
+	struct buffer_head *bh;
+	int i;
+
+#ifdef EXTENTS_STATS
+	{
+		struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
+		unsigned short ee_len = le16_to_cpu(ex->ee_len);
+		spin_lock(&sbi->s_ext_stats_lock);
+		sbi->s_ext_blocks += ee_len;
+		sbi->s_ext_extents++;
+		if (ee_len < sbi->s_ext_min)
+			sbi->s_ext_min = ee_len;
+		if (ee_len > sbi->s_ext_max)
+			sbi->s_ext_max = ee_len;
+		if (ext_depth(inode) > sbi->s_depth_max)
+			sbi->s_depth_max = ext_depth(inode);
+		spin_unlock(&sbi->s_ext_stats_lock);
+	}
+#endif
+	if (from >= le32_to_cpu(ex->ee_block)
+	    && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+		/* tail removal */
+		unsigned long num;
+		ext3_fsblk_t start;
+		num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
+		start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+		ext_debug("free last %lu blocks starting "E3FSBLK"\n", num, start);
+		for (i = 0; i < num; i++) {
+			bh = sb_find_get_block(inode->i_sb, start + i);
+			ext3_forget(handle, 0, inode, bh, start + i);
+		}
+		ext3_free_blocks(handle, inode, start, num);
+	} else if (from == le32_to_cpu(ex->ee_block)
+		   && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+		printk("strange request: removal %lu-%lu from %u:%u\n",
+			from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+	} else {
+		printk("strange request: removal(2) %lu-%lu from %u:%u\n",
+			from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+	}
+	return 0;
+}
+
+/*
+ * removes from the leaf at path[depth] everything at or after logical
+ * block @start, walking the extents from the last one backwards
+ * if the leaf ends up empty and lives in its own block (p_bh != NULL),
+ * its index entry is removed as well
+ */
+static int
+ext3_ext_rm_leaf(handle_t *handle, struct inode *inode,
+		struct ext3_ext_path *path, unsigned long start)
+{
+	int err = 0, correct_index = 0;
+	int depth = ext_depth(inode), credits;
+	struct ext3_extent_header *eh;
+	unsigned a, b, block, num;
+	unsigned long ex_ee_block;
+	unsigned short ex_ee_len;
+	struct ext3_extent *ex;
+
+	ext_debug("truncate since %lu in leaf\n", start);
+	if (!path[depth].p_hdr)
+		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
+	eh = path[depth].p_hdr;
+	BUG_ON(eh == NULL);
+	BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
+	BUG_ON(eh->eh_magic != EXT3_EXT_MAGIC);
+
+	/* find where to start removing */
+	ex = EXT_LAST_EXTENT(eh);
+
+	ex_ee_block = le32_to_cpu(ex->ee_block);
+	ex_ee_len = le16_to_cpu(ex->ee_len);
+
+	while (ex >= EXT_FIRST_EXTENT(eh) &&
+			ex_ee_block + ex_ee_len > start) {
+		ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+		path[depth].p_ext = ex;
+
+		/* [a, b] is the part of this extent to be removed */
+		a = ex_ee_block > start ? ex_ee_block : start;
+		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
+			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+
+		ext_debug(" border %u:%u\n", a, b);
+
+		/* block/num describe what is left of the extent */
+		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
+			block = 0;
+			num = 0;
+			BUG();
+		} else if (a != ex_ee_block) {
+			/* remove tail of the extent */
+			block = ex_ee_block;
+			num = a - block;
+		} else if (b != ex_ee_block + ex_ee_len - 1) {
+			/* remove head of the extent */
+			block = a;
+			num = b - a;
+			/* there is no "make a hole" API yet */
+			BUG();
+		} else {
+			/* remove whole extent: excellent! */
+			block = ex_ee_block;
+			num = 0;
+			BUG_ON(a != ex_ee_block);
+			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+		}
+
+		/* at present, extent can't cross block group */
+		/* leaf + bitmap + group desc + sb + inode */
+		credits = 5;
+		if (ex == EXT_FIRST_EXTENT(eh)) {
+			correct_index = 1;
+			credits += (ext_depth(inode)) + 1;
+		}
+#ifdef CONFIG_QUOTA
+		credits += 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+		handle = ext3_ext_journal_restart(handle, credits);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto out;
+		}
+
+		err = ext3_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		err = ext3_remove_blocks(handle, inode, ex, a, b);
+		if (err)
+			goto out;
+
+		if (num == 0) {
+			/* this extent is removed entirely mark slot unused */
+			ext3_ext_store_pblock(ex, 0);
+			eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
+		}
+
+		ex->ee_block = cpu_to_le32(block);
+		ex->ee_len = cpu_to_le16(num);
+
+		err = ext3_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		ext_debug("new extent: %u:%u:"E3FSBLK"\n", block, num,
+				ext_pblock(ex));
+		/* step to the previous extent; the loop condition checks
+		 * the pointer before the values read here are used */
+		ex--;
+		ex_ee_block = le32_to_cpu(ex->ee_block);
+		ex_ee_len = le16_to_cpu(ex->ee_len);
+	}
+
+	if (correct_index && eh->eh_entries)
+		err = ext3_ext_correct_indexes(handle, inode, path);
+
+	/* if this leaf is free, then we should
+	 * remove it from index block above */
+	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
+		err = ext3_ext_rm_idx(handle, inode, path + depth);
+
+out:
+	return err;
+}
+
+/*
+ * returns 1 if current index has to be freed (even partially)
+ */
+static int inline
+ext3_ext_more_to_rm(struct ext3_ext_path *path)
+{
+	BUG_ON(path->p_idx == NULL);
+
+	/* walked past the first index: this level is finished */
+	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
+		return 0;
+
+	/*
+	 * if truncate on deeper level happened, it wasn't partial
+	 * so we have to consider current index for truncation
+	 */
+	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
+		return 0;
+	return 1;
+}
+
+int 
ext3_ext_remove_space(struct inode *inode, unsigned long start)
+{
+	/*
+	 * removes everything at or after logical block @start from the
+	 * extent tree of @inode; runs in its own journal handle
+	 * path[0] describes the root kept in the inode, path[depth] a leaf
+	 */
+	struct super_block *sb = inode->i_sb;
+	int depth = ext_depth(inode);
+	struct ext3_ext_path *path;
+	handle_t *handle;
+	int i = 0, err = 0;
+
+	ext_debug("truncate since %lu\n", start);
+
+	/* probably first extent we're gonna free will be last in block */
+	handle = ext3_journal_start(inode, depth + 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	ext3_ext_invalidate_cache(inode);
+
+	/*
+	 * we start scanning from right side freeing all the blocks
+	 * after i_size and walking into the deep
+	 */
+	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
+	if (path == NULL) {
+		ext3_journal_stop(handle);
+		return -ENOMEM;
+	}
+	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
+	path[0].p_hdr = ext_inode_hdr(inode);
+	if (ext3_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
+		err = -EIO;
+		goto out;
+	}
+	path[0].p_depth = depth;
+
+	/* i is the level currently being processed */
+	while (i >= 0 && err == 0) {
+		if (i == depth) {
+			/* this is leaf block */
+			err = ext3_ext_rm_leaf(handle, inode, path, start);
+			/* root level have p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			continue;
+		}
+
+		/* this is index block */
+		if (!path[i].p_hdr) {
+			ext_debug("initialize header\n");
+			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
+			if (ext3_ext_check_header(__FUNCTION__, inode,
+							path[i].p_hdr)) {
+				err = -EIO;
+				goto out;
+			}
+		}
+
+		BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
+			   > le16_to_cpu(path[i].p_hdr->eh_max));
+		BUG_ON(path[i].p_hdr->eh_magic != EXT3_EXT_MAGIC);
+
+		if (!path[i].p_idx) {
+			/* this level hasn't been touched yet */
+			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
+			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
+			ext_debug("init index ptr: hdr 0x%p, num %d\n",
+				  path[i].p_hdr,
+				  le16_to_cpu(path[i].p_hdr->eh_entries));
+		} else {
+			/* we were here already, look at the next index */
+			path[i].p_idx--;
+		}
+
+		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
+				i, EXT_FIRST_INDEX(path[i].p_hdr),
+				path[i].p_idx);
+		if (ext3_ext_more_to_rm(path + i)) {
+			/* go to the next level */
+			ext_debug("move to level %d (block "E3FSBLK")\n",
+				  i + 1, idx_pblock(path[i].p_idx));
+			memset(path + i + 1, 0, sizeof(*path));
+			path[i+1].p_bh =
+				sb_bread(sb, idx_pblock(path[i].p_idx));
+			if (!path[i+1].p_bh) {
+				/* should we reset i_size? */
+				err = -EIO;
+				break;
+			}
+
+			/* put actual number of indexes to know is this
+			 * number got changed at the next iteration */
+			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
+			i++;
+		} else {
+			/* we finish processing this index, go up */
+			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
+				/* index is empty, remove it
+				 * handle must be already prepared by
+				 * ext3_ext_rm_leaf() */
+				err = ext3_ext_rm_idx(handle, inode, path + i);
+			}
+			/* root level have p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			ext_debug("return to level %d\n", i);
+		}
+	}
+
+	/* TODO: flexible tree reduction should be here */
+	if (path->p_hdr->eh_entries == 0) {
+		/*
+		 * truncate to zero freed all the tree
+		 * so, we need to correct eh_depth
+		 */
+		err = ext3_ext_get_access(handle, inode, path);
+		if (err == 0) {
+			ext_inode_hdr(inode)->eh_depth = 0;
+			ext_inode_hdr(inode)->eh_max =
+				cpu_to_le16(ext3_ext_space_root(inode));
+			err = ext3_ext_dirty(handle, inode, path);
+		}
+	}
+out:
+	ext3_ext_tree_changed(inode);
+	ext3_ext_drop_refs(path);
+	kfree(path);
+	ext3_journal_stop(handle);
+
+	return err;
+}
+
+/*
+ * called at mount time
+ */
+void ext3_ext_init(struct super_block *sb)
+{
+	/*
+	 * possible initialization would be here
+	 */
+
+	if (test_opt(sb, EXTENTS)) {
+		/* one log line, assembled from several printk() calls */
+		printk("EXT3-fs: file extents enabled");
+#ifdef AGRESSIVE_TEST
+		printk(", agressive tests");
+#endif
+#ifdef CHECK_BINSEARCH
+		printk(", check binsearch");
+#endif
+#ifdef EXTENTS_STATS
+		printk(", stats");
+#endif
+		printk("\n");
+#ifdef EXTENTS_STATS
+		spin_lock_init(&EXT3_SB(sb)->s_ext_stats_lock);
+		EXT3_SB(sb)->s_ext_min = 1 << 30;
+		EXT3_SB(sb)->s_ext_max = 0;
+#endif
+	}
+}
+
+/*
+ * called at umount time
+ */
+void ext3_ext_release(struct super_block *sb)
+{
+	if (!test_opt(sb, EXTENTS))
+		return;
+
+#ifdef EXTENTS_STATS
+	/* both counters are checked, so the average cannot divide by 0 */
+	if (EXT3_SB(sb)->s_ext_blocks && EXT3_SB(sb)->s_ext_extents) {
+		struct ext3_sb_info *sbi = EXT3_SB(sb);
+		printk(KERN_ERR "EXT3-fs: %lu blocks in %lu extents (%lu ave)\n",
+			sbi->s_ext_blocks, sbi->s_ext_extents,
+			sbi->s_ext_blocks / sbi->s_ext_extents);
+		printk(KERN_ERR "EXT3-fs: extents: %lu min, %lu max, max depth %lu\n",
+			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
+	}
+#endif
+}
+
+/*
+ * maps logical block @iblock of @inode, allocating up to @max_blocks
+ * blocks when @create is set and nothing is mapped there yet
+ * on a mapping, bh_result gets BH_Mapped, b_bdev and b_blocknr (plus
+ * BH_New for fresh allocations)
+ * returns the error if there is one, otherwise the number of blocks
+ * mapped starting at @iblock (0 for a hole with !create)
+ * takes truncate_mutex for the whole operation
+ */
+int ext3_ext_get_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t iblock,
+			unsigned long max_blocks, struct buffer_head *bh_result,
+			int create, int extend_disksize)
+{
+	struct ext3_ext_path *path = NULL;
+	struct ext3_extent newex, *ex;
+	ext3_fsblk_t goal, newblock;
+	int err = 0, depth;
+	unsigned long allocated = 0;
+
+	__clear_bit(BH_New, &bh_result->b_state);
+	ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
+			max_blocks, (unsigned) inode->i_ino);
+	mutex_lock(&EXT3_I(inode)->truncate_mutex);
+
+	/* check in cache */
+	if ((goal = ext3_ext_in_cache(inode, iblock, &newex))) {
+		if (goal == EXT3_EXT_CACHE_GAP) {
+			if (!create) {
+				/* block isn't allocated yet and
+				 * user doesn't want to allocate it */
+				goto out2;
+			}
+			/* we should allocate requested block */
+		} else if (goal == EXT3_EXT_CACHE_EXTENT) {
+			/* block is already allocated */
+			newblock = iblock
+				   - le32_to_cpu(newex.ee_block)
+				   + ext_pblock(&newex);
+			/* number of remain blocks in the extent */
+			allocated = le16_to_cpu(newex.ee_len) -
+					(iblock - le32_to_cpu(newex.ee_block));
+			goto out;
+		} else {
+			BUG();
+		}
+	}
+
+	/* find extent for this block */
+	path = ext3_ext_find_extent(inode, iblock, NULL);
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		path = NULL;
+		goto out2;
+	}
+
+	depth = ext_depth(inode);
+
+	/*
+	 * consistent leaf must not be empty
+	 * this situation is possible, though, _during_ tree modification
+	 * this is why assert can't be put in ext3_ext_find_extent()
+	 */
+	BUG_ON(path[depth].p_ext == NULL && depth != 0);
+
+	if ((ex = path[depth].p_ext)) {
+		unsigned long ee_block = le32_to_cpu(ex->ee_block);
+		ext3_fsblk_t ee_start = ext_pblock(ex);
+		unsigned short ee_len = le16_to_cpu(ex->ee_len);
+
+		/*
+		 * Allow future support for preallocated extents to be added
+		 * as an RO_COMPAT feature:
+		 * Uninitialized extents are treated as holes, except that
+		 * we avoid (fail) allocating new blocks during a write.
+		 */
+		if (ee_len > EXT_MAX_LEN)
+			goto out2;
+		/* if found extent covers block, simply return it */
+		if (iblock >= ee_block && iblock < ee_block + ee_len) {
+			newblock = iblock - ee_block + ee_start;
+			/* number of remain blocks in the extent */
+			allocated = ee_len - (iblock - ee_block);
+			ext_debug("%d fit into %lu:%d -> "E3FSBLK"\n", (int) iblock,
+					ee_block, ee_len, newblock);
+			ext3_ext_put_in_cache(inode, ee_block, ee_len,
+						ee_start, EXT3_EXT_CACHE_EXTENT);
+			goto out;
+		}
+	}
+
+	/*
+	 * requested block isn't allocated yet
+	 * we couldn't try to create block if create flag is zero
+	 */
+	if (!create) {
+		/* put just found gap into cache to speedup subsequent reqs */
+		ext3_ext_put_gap_in_cache(inode, path, iblock);
+		goto out2;
+	}
+
+	/* allocate new block */
+	goal = ext3_ext_find_goal(inode, path, iblock);
+	allocated = max_blocks;
+	newblock = ext3_new_blocks(handle, inode, goal, &allocated, &err);
+	if (!newblock)
+		goto out2;
+	ext_debug("allocate new block: goal "E3FSBLK", found "E3FSBLK"/%lu\n",
+			goal, newblock, allocated);
+
+	/* try to insert new extent into found leaf and return */
+	newex.ee_block = cpu_to_le32(iblock);
+	ext3_ext_store_pblock(&newex, newblock);
+	newex.ee_len = cpu_to_le16(allocated);
+	err = ext3_ext_insert_extent(handle, inode, path, &newex);
+	if (err) {
+		/*
+		 * the extent never made it into the tree: give the data
+		 * blocks we just allocated back, otherwise they would
+		 * stay marked in use without anything referencing them
+		 */
+		ext3_free_blocks(handle, inode, ext_pblock(&newex),
+					le16_to_cpu(newex.ee_len));
+		goto out2;
+	}
+
+	if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
+		EXT3_I(inode)->i_disksize = inode->i_size;
+
+	/* previous routine could use block we allocated */
+	newblock = ext_pblock(&newex);
+	__set_bit(BH_New, &bh_result->b_state);
+
+	ext3_ext_put_in_cache(inode, iblock, allocated, newblock,
+				EXT3_EXT_CACHE_EXTENT);
+out:
+	if (allocated > max_blocks)
+		allocated = max_blocks;
+	ext3_ext_show_leaf(inode, path);
+	__set_bit(BH_Mapped, &bh_result->b_state);
+	bh_result->b_bdev = inode->i_sb->s_bdev;
+	bh_result->b_blocknr = newblock;
+out2:
+	if (path) {
+		ext3_ext_drop_refs(path);
+		kfree(path);
+	}
+	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+
+	return err ? err : allocated;
+}
+
+/*
+ * truncates the extent tree of @inode down to inode->i_size
+ * @page, if given, is handed to ext3_block_truncate_page(); when no
+ * journal handle can be started the page is cleared and released here
+ * errors are not reported to the caller (void)
+ */
+void ext3_ext_truncate(struct inode * inode, struct page *page)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct super_block *sb = inode->i_sb;
+	unsigned long last_block;
+	handle_t *handle;
+	int err = 0;
+
+	/*
+	 * probably first extent we're gonna free will be last in block
+	 */
+	err = ext3_writepage_trans_blocks(inode) + 3;
+	handle = ext3_journal_start(inode, err);
+	if (IS_ERR(handle)) {
+		if (page) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+		return;
+	}
+
+	if (page)
+		ext3_block_truncate_page(handle, page, mapping, inode->i_size);
+
+	mutex_lock(&EXT3_I(inode)->truncate_mutex);
+	ext3_ext_invalidate_cache(inode);
+
+	/*
+	 * TODO: optimization is possible here
+	 * probably we need not scanning at all,
+	 * because page truncation is enough
+	 */
+	if (ext3_orphan_add(handle, inode))
+		goto out_stop;
+
+	/* we have to know where to truncate from in crash case */
+	EXT3_I(inode)->i_disksize = inode->i_size;
+	ext3_mark_inode_dirty(handle, inode);
+
+	/* first block that lies completely beyond i_size */
+	last_block = (inode->i_size + sb->s_blocksize - 1)
+			>> EXT3_BLOCK_SIZE_BITS(sb);
+	err = ext3_ext_remove_space(inode, last_block);
+
+	/* In a multi-transaction truncate, we only make the final
+	 * transaction synchronous */
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+
+out_stop:
+	/*
+	 * If this was a simple ftruncate(), and the file will remain alive
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext3_delete_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(handle, inode);
+
+	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+	ext3_journal_stop(handle);
+}
+
+/*
+ * this routine calculates max number of blocks we could modify
+ * in order to allocate @num new blocks for an inode
+ */
+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
+{
+	int needed;
+
+	/* worst case for a single insert (no path known) */
+	needed = ext3_ext_calc_credits_for_insert(inode, NULL);
+
+	/* caller wants to allocate num blocks, but note it includes sb */
+	needed = needed * num - (num - 1);
+
+#ifdef CONFIG_QUOTA
+	needed += 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+	return needed;
+}
+
+EXPORT_SYMBOL(ext3_mark_inode_dirty);
+EXPORT_SYMBOL(ext3_ext_invalidate_cache);
+EXPORT_SYMBOL(ext3_ext_insert_extent);
+EXPORT_SYMBOL(ext3_ext_walk_space);
+EXPORT_SYMBOL(ext3_ext_find_goal);
+EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
+
diff -puN fs/ext3/ialloc.c~ext3dev-2.6.17-git13 fs/ext3/ialloc.c --- linux-2.6.17-git13/fs/ext3/ialloc.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.932867616 -0700 +++ linux-2.6.17-git13-ming/fs/ext3/ialloc.c 2006-06-28 17:45:21.075851210 -0700 @@ -23,7 +23,7 @@ #include #include #include - +#include #include #include "xattr.h" @@ -60,12 +60,14 @@ read_inode_bitmap(struct super_block * s if (!desc) goto error_out; - bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + bh = sb_bread(sb, EXT3_INODE_BITMAP(desc, + ext3_group_first_block_no(sb, block_group))); if (!bh) ext3_error(sb, "read_inode_bitmap", "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %u", - block_group, le32_to_cpu(desc->bg_inode_bitmap)); + "block_group = %lu, inode_bitmap = %llu", + block_group, 
EXT3_INODE_BITMAP(desc, + ext3_group_first_block_no(sb, block_group))); error_out: return bh; } @@ -274,7 +276,8 @@ static int find_group_orlov(struct super freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - avefreeb = freeb / ngroups; + avefreeb = freeb; + sector_div(avefreeb, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if ((parent == sb->s_root->d_inode) || @@ -303,13 +306,15 @@ static int find_group_orlov(struct super goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; + blocks_per_dir = EXT3_BLOCKS_COUNT(es) - freeb; + sector_div(blocks_per_dir, ndirs); max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); + max_debt = EXT3_BLOCKS_PER_GROUP(sb); + sector_div(max_debt, max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST)); if (max_debt * INODE_COST > inodes_per_group) max_debt = inodes_per_group / INODE_COST; if (max_debt > 255) @@ -616,6 +621,17 @@ got: ext3_std_error(sb, err); goto fail_free_drop; } + if (test_opt(sb, EXTENTS)) { + EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; + ext3_ext_tree_init(handle, inode); + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { + err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) goto fail; + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); + BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + } + } ext3_debug("allocating inode %lu\n", inode->i_ino); goto really_out; diff -puN fs/ext3/inode.c~ext3dev-2.6.17-git13 fs/ext3/inode.c --- linux-2.6.17-git13/fs/ext3/inode.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.937867042 -0700 +++ 
linux-2.6.17-git13-ming/fs/ext3/inode.c 2006-06-28 17:45:21.082850407 -0700 @@ -39,8 +39,6 @@ #include "xattr.h" #include "acl.h" -static int ext3_writepage_trans_blocks(struct inode *inode); - /* * Test whether an inode is a fast symlink. */ @@ -803,6 +801,7 @@ int ext3_get_blocks_handle(handle_t *han ext3_fsblk_t first_block = 0; + J_ASSERT(!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -983,7 +982,7 @@ static int ext3_get_block(struct inode * get_block: if (ret == 0) { - ret = ext3_get_blocks_handle(handle, inode, iblock, + ret = ext3_get_blocks_wrap(handle, inode, iblock, max_blocks, bh_result, create, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); @@ -1007,7 +1006,7 @@ struct buffer_head *ext3_getblk(handle_t dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext3_get_blocks_handle(handle, inode, block, 1, + err = ext3_get_blocks_wrap(handle, inode, block, 1, &dummy, create, 1); if (err == 1) { err = 0; @@ -1755,7 +1754,7 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. 
*/ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, +int ext3_block_truncate_page(handle_t *handle, struct page *page, struct address_space *mapping, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -2259,6 +2258,9 @@ void ext3_truncate(struct inode *inode) return; } + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_truncate(inode, page); + handle = start_transaction(inode); if (IS_ERR(handle)) { if (page) { @@ -2431,8 +2433,9 @@ static ext3_fsblk_t ext3_get_inode_block */ offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * EXT3_INODE_SIZE(sb); - block = le32_to_cpu(gdp[desc].bg_inode_table) + - (offset >> EXT3_BLOCK_SIZE_BITS(sb)); + block = EXT3_INODE_TABLE((gdp+desc), + ext3_group_first_block_no(sb, block_group)) + + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); iloc->block_group = block_group; iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); @@ -2499,7 +2502,9 @@ static int __ext3_get_inode_loc(struct i goto make_io; bitmap_bh = sb_getblk(inode->i_sb, - le32_to_cpu(desc->bg_inode_bitmap)); + EXT3_INODE_BITMAP(desc, + ext3_group_first_block_no(inode->i_sb, + block_group))); if (!bitmap_bh) goto make_io; @@ -2639,6 +2644,10 @@ void ext3_read_inode(struct inode * inod ei->i_frag_size = raw_inode->i_fsize; #endif ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if ((sizeof(sector_t) > 4) && + (EXT3_SB(inode->i_sb)->s_es->s_creator_os != EXT3_OS_HURD)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; if (!S_ISREG(inode->i_mode)) { ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); } else { @@ -2772,6 +2781,10 @@ static int ext3_do_update_inode(handle_t raw_inode->i_frag = ei->i_frag_no; raw_inode->i_fsize = ei->i_frag_size; #endif + if ((sizeof(sector_t) > 4) && + (EXT3_SB(inode->i_sb)->s_es->s_creator_os != EXT3_OS_HURD)) + raw_inode->i_file_acl_high = + cpu_to_le16((__u64)ei->i_file_acl >> 32); raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); if (!S_ISREG(inode->i_mode)) { 
raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); @@ -3001,12 +3014,15 @@ err_out: * block and work out the exact number of indirects which are touched. Pah. */ -static int ext3_writepage_trans_blocks(struct inode *inode) +int ext3_writepage_trans_blocks(struct inode *inode) { int bpp = ext3_journal_blocks_per_page(inode); int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_writepage_trans_blocks(inode, bpp); + if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else diff -puN fs/ext3/ioctl.c~ext3dev-2.6.17-git13 fs/ext3/ioctl.c --- linux-2.6.17-git13/fs/ext3/ioctl.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.940866698 -0700 +++ linux-2.6.17-git13-ming/fs/ext3/ioctl.c 2006-06-28 17:45:21.083850292 -0700 @@ -247,7 +247,6 @@ flags_err: return err; } - default: return -ENOTTY; } diff -puN fs/ext3/Makefile~ext3dev-2.6.17-git13 fs/ext3/Makefile --- linux-2.6.17-git13/fs/ext3/Makefile~ext3dev-2.6.17-git13 2006-06-28 17:45:20.943866354 -0700 +++ linux-2.6.17-git13-ming/fs/ext3/Makefile 2006-06-28 17:45:21.084850177 -0700 @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff -puN fs/ext3/resize.c~ext3dev-2.6.17-git13 fs/ext3/resize.c --- linux-2.6.17-git13/fs/ext3/resize.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.946866010 -0700 +++ linux-2.6.17-git13-ming/fs/ext3/resize.c 2006-06-28 17:45:21.086849948 -0700 @@ -15,7 +15,6 @@ #include #include #include - #include #include @@ -28,7 +27,7 @@ static int verify_group_input(struct sup { struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; - ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count); + ext3_fsblk_t start = 
EXT3_BLOCKS_COUNT(es); ext3_fsblk_t end = start + input->blocks_count; unsigned group = input->group; ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; @@ -37,7 +36,7 @@ static int verify_group_input(struct sup le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; ext3_fsblk_t metaend = start + overhead; struct buffer_head *bh = NULL; - ext3_grpblk_t free_blocks_count; + ext3_grpblk_t free_blocks_count, offset; int err = -EINVAL; input->free_blocks_count = free_blocks_count = @@ -50,13 +49,13 @@ static int verify_group_input(struct sup "no-super", input->group, input->blocks_count, free_blocks_count, input->reserved_blocks); + ext3_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) ext3_warning(sb, __FUNCTION__, "Cannot add at group %u (only %lu groups)", input->group, sbi->s_groups_count); - else if ((start - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb)) - ext3_warning(sb, __FUNCTION__, "Last group not full"); + else if (offset != 0) + ext3_warning(sb, __FUNCTION__, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) ext3_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", input->reserved_blocks); @@ -818,9 +817,12 @@ int ext3_group_add(struct super_block *s /* Update group descriptor block for new group */ gdp = (struct ext3_group_desc *)primary->b_data + gdb_off; - gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); - gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); - gdp->bg_inode_table = cpu_to_le32(input->inode_table); + EXT3_BLOCK_BITMAP_SET(gdp, ext3_group_first_block_no(sb, gdb_num), + input->block_bitmap); /* LV FIXME */ + EXT3_INODE_BITMAP_SET(gdp, ext3_group_first_block_no(sb, gdb_num), + input->inode_bitmap); /* LV FIXME */ + EXT3_INODE_TABLE_SET(gdp, ext3_group_first_block_no(sb, gdb_num), + input->inode_table); /* LV FIXME */ gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); gdp->bg_free_inodes_count = 
cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); @@ -834,7 +836,7 @@ int ext3_group_add(struct super_block *s * blocks/inodes before the group is live won't actually let us * allocate the new space yet. */ - es->s_blocks_count = cpu_to_le32(le32_to_cpu(es->s_blocks_count) + + EXT3_BLOCKS_COUNT_SET(es, EXT3_BLOCKS_COUNT(es) + input->blocks_count); es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb)); @@ -870,7 +872,7 @@ int ext3_group_add(struct super_block *s /* Update the reserved block counts only once the new group is * active. */ - es->s_r_blocks_count = cpu_to_le32(le32_to_cpu(es->s_r_blocks_count) + + EXT3_R_BLOCKS_COUNT_SET(es, EXT3_R_BLOCKS_COUNT(es) + input->reserved_blocks); /* Update the free space counts */ @@ -921,7 +923,7 @@ int ext3_group_extend(struct super_block /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after * taking lock_super() below. */ - o_blocks_count = le32_to_cpu(es->s_blocks_count); + o_blocks_count = EXT3_BLOCKS_COUNT(es); o_groups_count = EXT3_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) @@ -933,7 +935,7 @@ int ext3_group_extend(struct super_block if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { printk(KERN_ERR "EXT3-fs: filesystem on %s:" - " too large to resize to %lu blocks safely\n", + " too large to resize to "E3FSBLK" blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) ext3_warning(sb, __FUNCTION__, @@ -948,8 +950,7 @@ int ext3_group_extend(struct super_block } /* Handle the remaining blocks in the last group only. 
*/ - last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb); + ext3_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); if (last == 0) { ext3_warning(sb, __FUNCTION__, @@ -988,7 +989,7 @@ int ext3_group_extend(struct super_block } lock_super(sb); - if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { + if (o_blocks_count != EXT3_BLOCKS_COUNT(es)) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); unlock_super(sb); @@ -1004,7 +1005,7 @@ int ext3_group_extend(struct super_block ext3_journal_stop(handle); goto exit_put; } - es->s_blocks_count = cpu_to_le32(o_blocks_count + add); + EXT3_BLOCKS_COUNT_SET(es, o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); sb->s_dirt = 1; unlock_super(sb); @@ -1016,8 +1017,8 @@ int ext3_group_extend(struct super_block if ((err = ext3_journal_stop(handle))) goto exit_put; if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n", - le32_to_cpu(es->s_blocks_count)); + printk(KERN_DEBUG "EXT3-fs: extended group to %llu blocks\n", + EXT3_BLOCKS_COUNT(es)); update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es, sizeof(struct ext3_super_block)); exit_put: diff -puN fs/ext3/super.c~ext3dev-2.6.17-git13 fs/ext3/super.c --- linux-2.6.17-git13/fs/ext3/super.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.975862683 -0700 +++ linux-2.6.17-git13-ming/fs/ext3/super.c 2006-06-28 17:45:21.091849374 -0700 @@ -390,6 +390,7 @@ static void ext3_put_super (struct super struct ext3_super_block *es = sbi->s_es; int i; + ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { @@ -454,6 +455,7 @@ static struct inode *ext3_alloc_inode(st #endif ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + memset(&ei->i_cached_extent, 0, sizeof(struct ext3_ext_cache)); return &ei->vfs_inode; } @@ -636,7 +638,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, 
Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota + Opt_grpquota, Opt_extents, }; static match_table_t tokens = { @@ -686,6 +688,7 @@ static match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_extents, "extents"}, {Opt_err, NULL}, {Opt_resize, "resize"}, }; @@ -1018,6 +1021,9 @@ clear_qf_name: case Opt_bh: clear_opt(sbi->s_mount_opt, NOBH); break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " @@ -1145,44 +1151,48 @@ static int ext3_check_descriptors (struc if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) gdp = (struct ext3_group_desc *) sbi->s_group_desc[desc_block++]->b_data; - if (le32_to_cpu(gdp->bg_block_bitmap) < block || - le32_to_cpu(gdp->bg_block_bitmap) >= + if (EXT3_BLOCK_BITMAP(gdp, ext3_group_first_block_no(sb, i)) < + block || + EXT3_BLOCK_BITMAP(gdp, ext3_group_first_block_no(sb, i)) >= block + EXT3_BLOCKS_PER_GROUP(sb)) { ext3_error (sb, "ext3_check_descriptors", "Block bitmap for group %d" " not in group (block %lu)!", i, (unsigned long) - le32_to_cpu(gdp->bg_block_bitmap)); + EXT3_BLOCK_BITMAP(gdp, ext3_group_first_block_no(sb, i))); return 0; } - if (le32_to_cpu(gdp->bg_inode_bitmap) < block || - le32_to_cpu(gdp->bg_inode_bitmap) >= + if (EXT3_INODE_BITMAP(gdp, ext3_group_first_block_no(sb, i)) < + block || + EXT3_INODE_BITMAP(gdp, ext3_group_first_block_no(sb, i)) >= block + EXT3_BLOCKS_PER_GROUP(sb)) { ext3_error (sb, "ext3_check_descriptors", "Inode bitmap for group %d" " not in group (block %lu)!", i, (unsigned long) - le32_to_cpu(gdp->bg_inode_bitmap)); + EXT3_INODE_BITMAP(gdp, ext3_group_first_block_no(sb, i))); return 0; } - if (le32_to_cpu(gdp->bg_inode_table) < block || - le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= - block + EXT3_BLOCKS_PER_GROUP(sb)) + if (EXT3_INODE_TABLE(gdp, 
ext3_group_first_block_no(sb, i)) < + block || + EXT3_INODE_TABLE(gdp, ext3_group_first_block_no(sb, i)) + + sbi->s_itb_per_group >= + block + EXT3_BLOCKS_PER_GROUP(sb)) { ext3_error (sb, "ext3_check_descriptors", "Inode table for group %d" " not in group (block %lu)!", i, (unsigned long) - le32_to_cpu(gdp->bg_inode_table)); + EXT3_INODE_TABLE(gdp, ext3_group_first_block_no(sb, i))); return 0; } block += EXT3_BLOCKS_PER_GROUP(sb); gdp++; } - sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); + EXT3_FREE_BLOCKS_COUNT_SET(sbi->s_es, ext3_count_free_blocks(sb)); sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb)); return 1; } @@ -1359,6 +1369,7 @@ static int ext3_fill_super (struct super int i; int needs_recovery; __le32 features; + __u64 blocks_count; sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -1382,8 +1393,8 @@ static int ext3_fill_super (struct super * block sizes. We need to calculate the offset from buffer start. */ if (blocksize != EXT3_MIN_BLOCK_SIZE) { - logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; - offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + logic_sb_block = sb_block * EXT3_MIN_BLOCK_SIZE; + offset = sector_div(logic_sb_block, blocksize); } else { logic_sb_block = sb_block; } @@ -1488,8 +1499,8 @@ static int ext3_fill_super (struct super brelse (bh); sb_set_blocksize(sb, blocksize); - logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; - offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + logic_sb_block = sb_block * EXT3_MIN_BLOCK_SIZE; + offset = sector_div(logic_sb_block, blocksize); bh = sb_bread(sb, logic_sb_block); if (!bh) { printk(KERN_ERR @@ -1569,7 +1580,7 @@ static int ext3_fill_super (struct super goto failed_mount; } - if (le32_to_cpu(es->s_blocks_count) > + if (EXT3_BLOCKS_COUNT(es) > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { printk(KERN_ERR "EXT3-fs: filesystem on %s:" " too large to mount safely\n", sb->s_id); @@ -1581,10 +1592,11 @@ 
static int ext3_fill_super (struct super if (EXT3_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext3; - sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) + - EXT3_BLOCKS_PER_GROUP(sb) - 1) / - EXT3_BLOCKS_PER_GROUP(sb); + blocks_count = (EXT3_BLOCKS_COUNT(es) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT3_BLOCKS_PER_GROUP(sb)); + sbi->s_groups_count = blocks_count; db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / EXT3_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), @@ -1743,6 +1755,8 @@ static int ext3_fill_super (struct super test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); + ext3_ext_init(sb); + lock_kernel(); return 0; @@ -1896,7 +1910,7 @@ static journal_t *ext3_get_dev_journal(s goto out_bdev; } - len = le32_to_cpu(es->s_blocks_count); + len = EXT3_BLOCKS_COUNT(es); start = sb_block + 1; brelse(bh); /* we're done with the superblock */ @@ -2066,7 +2080,7 @@ static void ext3_commit_super (struct su if (!sbh) return; es->s_wtime = cpu_to_le32(get_seconds()); - es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); + EXT3_FREE_BLOCKS_COUNT_SET(es, ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); @@ -2259,7 +2273,7 @@ static int ext3_remount (struct super_bl ext3_init_journal_params(sb, sbi->s_journal); if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || - n_blocks_count > le32_to_cpu(es->s_blocks_count)) { + n_blocks_count > EXT3_BLOCKS_COUNT(es)) { if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { err = -EROFS; goto restore_opts; @@ -2380,10 +2394,10 @@ static int ext3_statfs (struct dentry * buf->f_type = EXT3_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; + buf->f_blocks = EXT3_BLOCKS_COUNT(es) - overhead; 
buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); - buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); - if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = buf->f_bfree - EXT3_R_BLOCKS_COUNT(es); + if (buf->f_bfree < EXT3_R_BLOCKS_COUNT(es)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); diff -puN fs/jbd/commit.c~ext3dev-2.6.17-git13 fs/jbd/commit.c --- linux-2.6.17-git13/fs/jbd/commit.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.980862109 -0700 +++ linux-2.6.17-git13-ming/fs/jbd/commit.c 2006-06-28 17:45:21.093849145 -0700 @@ -160,6 +160,14 @@ static int journal_write_commit_record(j return (ret == -EIO); } +static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, + sector_t block) +{ + tag->t_blocknr = cpu_to_be32(block & (u32)~0); + if (tag_bytes > JBD_TAG_SIZE32) + tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); +} + /* * journal_commit_transaction * @@ -174,7 +182,7 @@ void journal_commit_transaction(journal_ int bufs; int flags; int err; - unsigned long blocknr; + sector_t blocknr; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; @@ -182,6 +190,7 @@ void journal_commit_transaction(journal_ int first_tag = 0; int tag_flag; int i; + int tag_bytes = journal_tag_bytes(journal); /* * First job: lock down the current transaction and wait for @@ -553,10 +562,10 @@ write_out_data: tag_flag |= JFS_FLAG_SAME_UUID; tag = (journal_block_tag_t *) tagp; - tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); + write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be32(tag_flag); - tagp += sizeof(journal_block_tag_t); - space_left -= sizeof(journal_block_tag_t); + tagp += tag_bytes; + space_left -= tag_bytes; if (first_tag) { memcpy (tagp, journal->j_uuid, 16); @@ -570,7 +579,7 @@ write_out_data: if (bufs == journal->j_wbufsize || commit_transaction->t_buffers == NULL || 
- space_left < sizeof(journal_block_tag_t) + 16) { + space_left < tag_bytes + 16) { jbd_debug(4, "JBD: Submit %d IOs\n", bufs); diff -puN fs/jbd/journal.c~ext3dev-2.6.17-git13 fs/jbd/journal.c --- linux-2.6.17-git13/fs/jbd/journal.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.983861765 -0700 +++ linux-2.6.17-git13-ming/fs/jbd/journal.c 2006-06-28 17:45:21.096848800 -0700 @@ -270,7 +270,7 @@ static void journal_kill_thread(journal_ int journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct journal_head **jh_out, - int blocknr) + sector_t blocknr) { int need_copy_out = 0; int done_copy_out = 0; @@ -554,7 +554,7 @@ int log_wait_commit(journal_t *journal, * Log buffer allocation routines: */ -int journal_next_log_block(journal_t *journal, unsigned long *retp) +int journal_next_log_block(journal_t *journal, sector_t *retp) { unsigned long blocknr; @@ -578,10 +578,10 @@ int journal_next_log_block(journal_t *jo * ready. */ int journal_bmap(journal_t *journal, unsigned long blocknr, - unsigned long *retp) + sector_t *retp) { int err = 0; - unsigned long ret; + sector_t ret; if (journal->j_inode) { ret = bmap(journal->j_inode, blocknr); @@ -617,7 +617,7 @@ int journal_bmap(journal_t *journal, uns struct journal_head *journal_get_descriptor_buffer(journal_t *journal) { struct buffer_head *bh; - unsigned long blocknr; + sector_t blocknr; int err; err = journal_next_log_block(journal, &blocknr); @@ -705,7 +705,7 @@ fail: */ journal_t * journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, - int start, int len, int blocksize) + sector_t start, int len, int blocksize) { journal_t *journal = journal_init_common(); struct buffer_head *bh; @@ -753,7 +753,7 @@ journal_t * journal_init_inode (struct i journal_t *journal = journal_init_common(); int err; int n; - unsigned long blocknr; + sector_t blocknr; if (!journal) return NULL; @@ -853,7 +853,7 @@ static int journal_reset(journal_t *jour **/ int journal_create(journal_t 
*journal) { - unsigned long blocknr; + sector_t blocknr; struct buffer_head *bh; journal_superblock_t *sb; int i, err; @@ -1603,6 +1603,17 @@ int journal_blocks_per_page(struct inode } /* + * helper functions to deal with 32 or 64bit block numbers. + */ +size_t journal_tag_bytes(journal_t *journal) +{ + if (JFS_HAS_INCOMPAT_FEATURE(journal, JFS_FEATURE_INCOMPAT_64BIT)) + return JBD_TAG_SIZE64; + else + return JBD_TAG_SIZE32; +} + +/* * Simple support for retrying memory allocations. Introduced to help to * debug different VM deadlock avoidance strategies. */ diff -puN fs/jbd/recovery.c~ext3dev-2.6.17-git13 fs/jbd/recovery.c --- linux-2.6.17-git13/fs/jbd/recovery.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.986861421 -0700 +++ linux-2.6.17-git13-ming/fs/jbd/recovery.c 2006-06-28 17:45:21.098848571 -0700 @@ -70,7 +70,7 @@ static int do_readahead(journal_t *journ { int err; unsigned int max, nbufs, next; - unsigned long blocknr; + sector_t blocknr; struct buffer_head *bh; struct buffer_head * bufs[MAXBUF]; @@ -132,7 +132,7 @@ static int jread(struct buffer_head **bh unsigned int offset) { int err; - unsigned long blocknr; + sector_t blocknr; struct buffer_head *bh; *bhp = NULL; @@ -178,19 +178,20 @@ static int jread(struct buffer_head **bh * Count the number of in-use tags in a journal descriptor block. 
*/ -static int count_tags(struct buffer_head *bh, int size) +static int count_tags(journal_t *journal, struct buffer_head *bh) { char * tagp; journal_block_tag_t * tag; - int nr = 0; + int nr = 0, size = journal->j_blocksize; + int tag_bytes = journal_tag_bytes(journal); tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { + while ((tagp - bh->b_data + tag_bytes) <= size) { tag = (journal_block_tag_t *) tagp; nr++; - tagp += sizeof(journal_block_tag_t); + tagp += tag_bytes; if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) tagp += 16; @@ -307,6 +308,14 @@ int journal_skip_recovery(journal_t *jou return err; } +static inline sector_t read_tag_block(int tag_bytes, journal_block_tag_t *tag) +{ + sector_t block = be32_to_cpu(tag->t_blocknr); + if (tag_bytes > JBD_TAG_SIZE32) + block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; + return block; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -318,11 +327,12 @@ static int do_one_pass(journal_t *journa struct buffer_head * bh; unsigned int sequence; int blocktype; + int tag_bytes = journal_tag_bytes(journal); /* Precompute the maximum metadata descriptors in a descriptor block */ int MAX_BLOCKS_PER_DESC; MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / sizeof(journal_block_tag_t)); + / tag_bytes); /* * First thing is to establish what we expect to find in the log @@ -412,8 +422,7 @@ static int do_one_pass(journal_t *journa * in pass REPLAY; otherwise, just skip over the * blocks it describes. */ if (pass != PASS_REPLAY) { - next_log_block += - count_tags(bh, journal->j_blocksize); + next_log_block += count_tags(journal, bh); wrap(journal, next_log_block); brelse(bh); continue; @@ -424,7 +433,7 @@ static int do_one_pass(journal_t *journa * getting done here! 
*/ tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) + while ((tagp - bh->b_data + tag_bytes) <= journal->j_blocksize) { unsigned long io_block; @@ -443,10 +452,11 @@ static int do_one_pass(journal_t *journa "block %ld in log\n", err, io_block); } else { - unsigned long blocknr; + sector_t blocknr; J_ASSERT(obh != NULL); - blocknr = be32_to_cpu(tag->t_blocknr); + blocknr = read_tag_block(tag_bytes, + tag); /* If the block has been * revoked, then we're all done @@ -494,7 +504,7 @@ static int do_one_pass(journal_t *journa } skip_write: - tagp += sizeof(journal_block_tag_t); + tagp += tag_bytes; if (!(flags & JFS_FLAG_SAME_UUID)) tagp += 16; @@ -572,17 +582,24 @@ static int scan_revoke_records(journal_t { journal_revoke_header_t *header; int offset, max; + int record_len = 4; header = (journal_revoke_header_t *) bh->b_data; offset = sizeof(journal_revoke_header_t); max = be32_to_cpu(header->r_count); - while (offset < max) { - unsigned long blocknr; + if (JFS_HAS_INCOMPAT_FEATURE(journal, JFS_FEATURE_INCOMPAT_64BIT)) + record_len = 8; + + while (offset + record_len <= max) { /* <=: the last record ends exactly at max and must not be skipped */ + sector_t blocknr; int err; - blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); - offset += 4; + if (record_len == 4) + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + else + blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); + offset += record_len; err = journal_set_revoke(journal, blocknr, sequence); if (err) return err; diff -puN fs/jbd/revoke.c~ext3dev-2.6.17-git13 fs/jbd/revoke.c --- linux-2.6.17-git13/fs/jbd/revoke.c~ext3dev-2.6.17-git13 2006-06-28 17:45:20.990860962 -0700 +++ linux-2.6.17-git13-ming/fs/jbd/revoke.c 2006-06-28 17:45:21.100848342 -0700 @@ -81,7 +81,7 @@ struct jbd_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ - unsigned long blocknr; + sector_t blocknr; }; @@ -106,17 +106,18 @@ static void flush_descriptor(journal_t * /* Utility functions to maintain
the revoke table */ /* Borrowed from buffer.c: this is a tried and tested block hash function */ -static inline int hash(journal_t *journal, unsigned long block) +static inline int hash(journal_t *journal, sector_t block) { struct jbd_revoke_table_s *table = journal->j_revoke; int hash_shift = table->hash_shift; + int hash = (int)block ^ (int)((block >> 31) >> 1); - return ((block << (hash_shift - 6)) ^ - (block >> 13) ^ - (block << (hash_shift - 12))) & (table->hash_size - 1); + return ((hash << (hash_shift - 6)) ^ + (hash >> 13) ^ + (hash << (hash_shift - 12))) & (table->hash_size - 1); } -static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, +static int insert_revoke_hash(journal_t *journal, sector_t blocknr, tid_t seq) { struct list_head *hash_list; @@ -146,7 +147,7 @@ oom: /* Find a revoke record in the journal's hash table. */ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned long blocknr) + sector_t blocknr) { struct list_head *hash_list; struct jbd_revoke_record_s *record; @@ -325,7 +326,7 @@ void journal_destroy_revoke(journal_t *j * by one. 
*/ -int journal_revoke(handle_t *handle, unsigned long blocknr, +int journal_revoke(handle_t *handle, sector_t blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; @@ -394,7 +395,8 @@ int journal_revoke(handle_t *handle, uns } } - jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); + jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n", + (unsigned long long)blocknr, bh_in); /* sector_t may be unsigned long; cast to match %llu */ err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); @@ -584,9 +586,17 @@ static void write_one_revoke_record(jour *descriptorp = descriptor; } - * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = - cpu_to_be32(record->blocknr); - offset += 4; + if (JFS_HAS_INCOMPAT_FEATURE(journal, JFS_FEATURE_INCOMPAT_64BIT)) { + * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be64(record->blocknr); + offset += 8; + + } else { + * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be32(record->blocknr); + offset += 4; + } + *offsetp = offset; } @@ -641,7 +651,7 @@ static void flush_descriptor(journal_t * */ int journal_set_revoke(journal_t *journal, - unsigned long blocknr, + sector_t blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; @@ -665,7 +675,7 @@ int journal_set_revoke(journal_t *journa */ int journal_test_revoke(journal_t *journal, - unsigned long blocknr, + sector_t blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; diff -puN include/asm-h8300/types.h~ext3dev-2.6.17-git13 include/asm-h8300/types.h --- linux-2.6.17-git13/include/asm-h8300/types.h~ext3dev-2.6.17-git13 2006-06-28 17:45:20.994860503 -0700 +++ linux-2.6.17-git13-ming/include/asm-h8300/types.h 2006-06-28 17:45:21.101848227 -0700 @@ -57,6 +57,7 @@ typedef u32 dma_addr_t; #define HAVE_SECTOR_T typedef u64 sector_t; +#define SECTOR_FMT "%llu" #define HAVE_BLKCNT_T typedef u64 blkcnt_t; diff -puN include/asm-i386/types.h~ext3dev-2.6.17-git13 include/asm-i386/types.h ---
linux-2.6.17-git13/include/asm-i386/types.h~ext3dev-2.6.17-git13 2006-06-28 17:45:20.997860159 -0700 +++ linux-2.6.17-git13-ming/include/asm-i386/types.h 2006-06-28 17:45:21.102848112 -0700 @@ -59,6 +59,7 @@ typedef u64 dma64_addr_t; #ifdef CONFIG_LBD typedef u64 sector_t; +#define SECTOR_FMT "%llu" #define HAVE_SECTOR_T #endif diff -puN include/asm-mips/types.h~ext3dev-2.6.17-git13 include/asm-mips/types.h --- linux-2.6.17-git13/include/asm-mips/types.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.000859814 -0700 +++ linux-2.6.17-git13-ming/include/asm-mips/types.h 2006-06-28 17:45:21.102848112 -0700 @@ -95,6 +95,11 @@ typedef unsigned long phys_t; #ifdef CONFIG_LBD typedef u64 sector_t; +#if (_MIPS_SZLONG == 64) +#define SECTOR_FMT "%lu" +#else +#define SECTOR_FMT "%llu" +#endif #define HAVE_SECTOR_T #endif diff -puN include/asm-powerpc/types.h~ext3dev-2.6.17-git13 include/asm-powerpc/types.h --- linux-2.6.17-git13/include/asm-powerpc/types.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.004859355 -0700 +++ linux-2.6.17-git13-ming/include/asm-powerpc/types.h 2006-06-28 17:45:21.103847997 -0700 @@ -99,6 +99,11 @@ typedef struct { #ifdef CONFIG_LBD typedef u64 sector_t; +#ifdef __powerpc64__ +#define SECTOR_FMT "%lu" +#else +#define SECTOR_FMT "%llu" +#endif #define HAVE_SECTOR_T #endif diff -puN include/asm-s390/types.h~ext3dev-2.6.17-git13 include/asm-s390/types.h --- linux-2.6.17-git13/include/asm-s390/types.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.007859011 -0700 +++ linux-2.6.17-git13-ming/include/asm-s390/types.h 2006-06-28 17:45:21.104847883 -0700 @@ -89,6 +89,11 @@ typedef union { #ifdef CONFIG_LBD typedef u64 sector_t; +#ifndef __s390x__ +#define SECTOR_FMT "%llu" +#else +#define SECTOR_FMT "%lu" +#endif #define HAVE_SECTOR_T #endif diff -puN include/asm-sh/types.h~ext3dev-2.6.17-git13 include/asm-sh/types.h --- linux-2.6.17-git13/include/asm-sh/types.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.011858552 -0700 +++ linux-2.6.17-git13-ming/include/asm-sh/types.h 
2006-06-28 17:45:21.104847883 -0700 @@ -54,6 +54,7 @@ typedef u32 dma_addr_t; #ifdef CONFIG_LBD typedef u64 sector_t; +#define SECTOR_FMT "%llu" #define HAVE_SECTOR_T #endif diff -puN include/asm-x86_64/types.h~ext3dev-2.6.17-git13 include/asm-x86_64/types.h --- linux-2.6.17-git13/include/asm-x86_64/types.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.014858208 -0700 +++ linux-2.6.17-git13-ming/include/asm-x86_64/types.h 2006-06-28 17:45:21.105847768 -0700 @@ -49,6 +49,7 @@ typedef u64 dma64_addr_t; typedef u64 dma_addr_t; typedef u64 sector_t; +#define SECTOR_FMT "%llu" #define HAVE_SECTOR_T #endif /* __ASSEMBLY__ */ diff -puN /dev/null include/linux/ext3_fs_extents.h --- /dev/null 2006-06-28 00:02:13.345547960 -0700 +++ linux-2.6.17-git13-ming/include/linux/ext3_fs_extents.h 2006-06-28 17:45:21.106847653 -0700 @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA + */ + +#ifndef _LINUX_EXT3_EXTENTS +#define _LINUX_EXT3_EXTENTS + +#include + +/* + * with AGRESSIVE_TEST defined capacity of index/leaf blocks + * becomes very little, so index split, in-depth growing and + * other hard changes happen much more often + * this is for debug purposes only + */ +#define AGRESSIVE_TEST_ + +/* + * with EXTENTS_STATS defined number of blocks and extents + * are collected in truncate path. they'll be shown at + * umount time + */ +#define EXTENTS_STATS__ + +/* + * if CHECK_BINSEARCH defined, then results of binary search + * will be checked by linear search + */ +#define CHECK_BINSEARCH__ + +/* + * if EXT_DEBUG is defined you can use 'extdebug' mount option + * to get lots of info what's going on + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(a...) printk(a) +#else +#define ext_debug(a...) +#endif + +/* + * if EXT_STATS is defined then stats numbers are collected + * these numbers will be displayed at umount time + */ +#define EXT_STATS_ + + +/* + * ext3_inode has i_block array (60 bytes total) + * first 12 bytes store ext3_extent_header + * the remainder stores array of ext3_extent + */ + +/* + * this is extent on-disk structure + * it's used at the bottom of the tree + */ +struct ext3_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start; /* low 32 bits of physical block */ +}; + +/* + * this is index on-disk structure + * it's used at all the levels, but the bottom + */ +struct ext3_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf; /* pointer to the physical block of the next * + * level.
leaf or next index could be here */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * each block (leaves and indexes), even inode-stored has header + */ +struct ext3_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? */ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT3_EXT_MAGIC cpu_to_le16(0xf30a) + +/* + * array of ext3_ext_path contains path to some extent + * creation/lookup routines use it for traversal/splitting/etc + * truncate uses it to simulate recursive walking + */ +struct ext3_ext_path { + ext3_fsblk_t p_block; + __u16 p_depth; + struct ext3_extent *p_ext; + struct ext3_extent_idx *p_idx; + struct ext3_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +#define EXT3_EXT_CACHE_NO 0 +#define EXT3_EXT_CACHE_GAP 1 +#define EXT3_EXT_CACHE_EXTENT 2 + +/* + * to be called by ext3_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext3_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, struct ext3_ext_path *, + struct ext3_ext_cache *, + void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 + + +#define EXT_MAX_BLOCK 0xffffffff + +#define EXT_MAX_LEN ((1UL << 15) - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext3_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext3_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext3_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ +
(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext3_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext3_extent_header *) EXT3_I(inode)->i_data; +} + +static inline struct ext3_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext3_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void ext3_ext_tree_changed(struct inode *inode) +{ + EXT3_I(inode)->i_ext_generation++; +} + +static inline void +ext3_ext_invalidate_cache(struct inode *inode) +{ + EXT3_I(inode)->i_cached_extent.ec_type = EXT3_EXT_CACHE_NO; +} + +extern int ext3_extent_tree_init(handle_t *, struct inode *); +extern int ext3_ext_calc_credits_for_insert(struct inode *, struct ext3_ext_path *); +extern int ext3_ext_insert_extent(handle_t *, struct inode *, struct ext3_ext_path *, struct ext3_extent *); +extern int ext3_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *); +extern struct ext3_ext_path * ext3_ext_find_extent(struct inode *, int, struct ext3_ext_path *); + +#endif /* _LINUX_EXT3_EXTENTS */ + diff -puN include/linux/ext3_fs.h~ext3dev-2.6.17-git13 include/linux/ext3_fs.h --- linux-2.6.17-git13/include/linux/ext3_fs.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.041855111 -0700 +++ linux-2.6.17-git13-ming/include/linux/ext3_fs.h 2006-06-28 17:45:21.109847309 -0700 @@ -17,6 +17,7 @@ #define _LINUX_EXT3_FS_H #include +#include /* * The second extended filesystem constants/structures @@ -135,6 +136,57 @@ struct ext3_group_desc __le32 
bg_reserved[3]; }; +#ifdef __KERNEL__ +#include +#include +static inline u32 EXT3_RELATIVE_ENCODE(ext3_fsblk_t group_base, + ext3_fsblk_t fs_block) +{ + s32 gdp_block; + + if (fs_block < (1ULL<<32) && group_base < (1ULL<<32)) + return fs_block; + + gdp_block = (fs_block - group_base); + BUG_ON ((group_base + gdp_block) != fs_block); + + return gdp_block; +} + +static inline ext3_fsblk_t EXT3_RELATIVE_DECODE(ext3_fsblk_t group_base, + u32 gdp_block) +{ + if (group_base >= (1ULL<<32)) + return group_base + (s32) gdp_block; + + if ((s32) gdp_block >= 0 && gdp_block < group_base && + group_base + gdp_block >= (1ULL<<32)) + return group_base + gdp_block; + + return gdp_block; +} + +#define EXT3_BLOCK_BITMAP(bg, group_base) \ + EXT3_RELATIVE_DECODE(group_base, le32_to_cpu((bg)->bg_block_bitmap)) +#define EXT3_INODE_BITMAP(bg, group_base) \ + EXT3_RELATIVE_DECODE(group_base, le32_to_cpu((bg)->bg_inode_bitmap)) +#define EXT3_INODE_TABLE(bg, group_base) \ + EXT3_RELATIVE_DECODE(group_base, le32_to_cpu((bg)->bg_inode_table)) + +#define EXT3_BLOCK_BITMAP_SET(bg, group_base, value) \ + do {(bg)->bg_block_bitmap = cpu_to_le32(EXT3_RELATIVE_ENCODE(group_base, value));} while(0) +#define EXT3_INODE_BITMAP_SET(bg, group_base, value) \ + do {(bg)->bg_inode_bitmap = cpu_to_le32(EXT3_RELATIVE_ENCODE(group_base, value));} while(0) +#define EXT3_INODE_TABLE_SET(bg, group_base, value) \ + do {(bg)->bg_inode_table = cpu_to_le32(EXT3_RELATIVE_ENCODE(group_base, value));} while(0) + +#define EXT3_IS_USED_BLOCK_BITMAP(bg) \ + ((bg)->bg_block_bitmap != 0) +#define EXT3_IS_USED_INODE_BITMAP(bg) \ + ((bg)->bg_inode_bitmap != 0) +#define EXT3_IS_USED_INODE_TABLE(bg) \ + ((bg)->bg_inode_table != 0) +#endif /* * Macro-instructions used to manage group descriptors */ @@ -182,8 +234,9 @@ struct ext3_group_desc #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ +#define
EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ -#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ /* @@ -283,7 +336,7 @@ struct ext3_inode { struct { __u8 l_i_frag; /* Fragment number */ __u8 l_i_fsize; /* Fragment size */ - __u16 i_pad1; + __u16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ __u32 l_i_reserved2; @@ -299,7 +352,7 @@ struct ext3_inode { struct { __u8 m_i_frag; /* Fragment number */ __u8 m_i_fsize; /* Fragment size */ - __u16 m_pad1; + __u16 m_i_file_acl_high; __u32 m_i_reserved2[2]; } masix2; } osd2; /* OS dependent 2 */ @@ -313,6 +366,7 @@ struct ext3_inode { #define i_reserved1 osd1.linux1.l_i_reserved1 #define i_frag osd2.linux2.l_i_frag #define i_fsize osd2.linux2.l_i_fsize +#define i_file_acl_high osd2.linux2.l_i_file_acl_high #define i_uid_low i_uid #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high @@ -333,6 +387,7 @@ struct ext3_inode { #define i_reserved1 osd1.masix1.m_i_reserved1 #define i_frag osd2.masix2.m_i_frag #define i_fsize osd2.masix2.m_i_fsize +#define i_file_acl_high osd2.masix2.m_i_file_acl_high #define i_reserved2 osd2.masix2.m_i_reserved2 #endif /* defined(__KERNEL__) || defined(__linux__) */ @@ -371,6 +426,7 @@ struct ext3_inode { #define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT3_MOUNT_EXTENTS 0x400000 /* Extents support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -476,14 +532,43 @@ struct ext3_super_block { __u8 s_def_hash_version; /* Default hash version to use */ __u8 s_reserved_char_pad; __u16 s_reserved_word_pad; - __le32 s_default_mount_opts; +/*100*/ __le32 s_default_mount_opts; 
__le32 s_first_meta_bg; /* First metablock block group */ - __u32 s_reserved[190]; /* Padding to the end of the block */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT3_FEATURE_INCOMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __u32 s_reserved[169]; /* Padding to the end of the block */ }; + +#define EXT3_BLOCKS_COUNT(s) \ + ((ext3_fsblk_t)(((__u64)le32_to_cpu((s)->s_blocks_count_hi) << 32) | \ + (__u64)le32_to_cpu((s)->s_blocks_count))) +#define EXT3_BLOCKS_COUNT_SET(s,v) do { \ + (s)->s_blocks_count = cpu_to_le32((v)); \ + (s)->s_blocks_count_hi = cpu_to_le32(((__u64)(v)) >> 32); \ +} while (0) + +#define EXT3_R_BLOCKS_COUNT(s) \ + ((ext3_fsblk_t)(((__u64)le32_to_cpu((s)->s_r_blocks_count_hi) << 32) | \ + (__u64)le32_to_cpu((s)->s_r_blocks_count))) +#define EXT3_R_BLOCKS_COUNT_SET(s,v) do { \ + (s)->s_r_blocks_count = cpu_to_le32((v)); \ + (s)->s_r_blocks_count_hi = cpu_to_le32(((__u64)(v)) >> 32); \ +} while (0) + +#define EXT3_FREE_BLOCKS_COUNT(s) \ + ((ext3_fsblk_t)(((__u64)le32_to_cpu((s)->s_free_blocks_count_hi) << 32) | \ + (__u64)le32_to_cpu((s)->s_free_blocks_count))) +#define EXT3_FREE_BLOCKS_COUNT_SET(s,v) do { \ + (s)->s_free_blocks_count = cpu_to_le32((v)); \ + (s)->s_free_blocks_count_hi = cpu_to_le32(((__u64)(v)) >> 32); \ +} while (0) + #ifdef __KERNEL__ -#include -#include static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb) { return sb->s_fs_info; @@ -560,11 +645,15 @@ static inline struct ext3_inode_info *EX #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT3_FEATURE_INCOMPAT_64BIT 0x0080
#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ EXT3_FEATURE_INCOMPAT_RECOVER| \ - EXT3_FEATURE_INCOMPAT_META_BG) + EXT3_FEATURE_INCOMPAT_META_BG| \ + EXT3_FEATURE_INCOMPAT_EXTENTS| \ + EXT3_FEATURE_INCOMPAT_64BIT) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT3_FEATURE_RO_COMPAT_BTREE_DIR) @@ -724,6 +813,27 @@ ext3_group_first_block_no(struct super_b #define ERR_BAD_DX_DIR -75000 /* + * This function calculate the block group number and offset, + * given a block number + */ + +static inline void ext3_get_group_no_and_offset(struct super_block * sb, + ext3_fsblk_t blocknr, unsigned long* blockgrpp, + ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + ext3_grpblk_t offset; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + offset = sector_div(blocknr, EXT3_BLOCKS_PER_GROUP(sb)); + if (offsetp) + *offsetp = offset; + if (blockgrpp) + *blockgrpp = blocknr; + +} + +/* * Function prototypes */ @@ -736,6 +846,10 @@ ext3_group_first_block_no(struct super_b # define NORET_AND noreturn, /* balloc.c */ +extern unsigned int ext3_block_group(struct super_block *sb, + ext3_fsblk_t blocknr); +extern ext3_grpblk_t ext3_block_group_offset(struct super_block *sb, + ext3_fsblk_t blocknr); extern int ext3_bg_has_super(struct super_block *sb, int group); extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, @@ -803,6 +917,9 @@ extern int ext3_get_inode_loc(struct ino extern void ext3_truncate (struct inode *); extern void ext3_set_inode_flags(struct inode *); extern void ext3_set_aops(struct inode *inode); +extern int ext3_writepage_trans_blocks(struct inode *); +extern int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from); /* ioctl.c */ extern 
int ext3_ioctl (struct inode *, struct file *, unsigned int, @@ -856,6 +973,26 @@ extern struct inode_operations ext3_spec extern struct inode_operations ext3_symlink_inode_operations; extern struct inode_operations ext3_fast_symlink_inode_operations; +/* extents.c */ +extern int ext3_ext_tree_init(handle_t *handle, struct inode *); +extern int ext3_ext_writepage_trans_blocks(struct inode *, int); +extern int ext3_ext_get_blocks(handle_t *, struct inode *, sector_t, + unsigned long, struct buffer_head *, int, int); +extern void ext3_ext_truncate(struct inode *, struct page *); +extern void ext3_ext_init(struct super_block *); +extern void ext3_ext_release(struct super_block *); +static inline int +ext3_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, + unsigned long max_blocks, struct buffer_head *bh, + int create, int extend_disksize) +{ + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_get_blocks(handle, inode, block, max_blocks, + bh, create, extend_disksize); + return ext3_get_blocks_handle(handle, inode, block, max_blocks, bh, + create, extend_disksize); +} + #endif /* __KERNEL__ */ diff -puN include/linux/ext3_fs_i.h~ext3dev-2.6.17-git13 include/linux/ext3_fs_i.h --- linux-2.6.17-git13/include/linux/ext3_fs_i.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.044854766 -0700 +++ linux-2.6.17-git13-ming/include/linux/ext3_fs_i.h 2006-06-28 17:45:21.110847194 -0700 @@ -25,9 +25,9 @@ typedef int ext3_grpblk_t; /* data type for filesystem-wide blocks number */ -typedef unsigned long ext3_fsblk_t; +typedef sector_t ext3_fsblk_t; -#define E3FSBLK "%lu" +#define E3FSBLK SECTOR_FMT struct ext3_reserve_window { ext3_fsblk_t _rsv_start; /* First byte reserved */ @@ -65,6 +65,16 @@ struct ext3_block_alloc_info { #define rsv_end rsv_window._rsv_end /* + * storage for cached extent + */ +struct ext3_ext_cache { + ext3_fsblk_t ec_start; + __u32 ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* * third 
extended file system inode data in memory */ struct ext3_inode_info { @@ -142,6 +152,9 @@ struct ext3_inode_info { */ struct mutex truncate_mutex; struct inode vfs_inode; + + unsigned long i_ext_generation; + struct ext3_ext_cache i_cached_extent; }; #endif /* _LINUX_EXT3_FS_I */ diff -puN include/linux/ext3_fs_sb.h~ext3dev-2.6.17-git13 include/linux/ext3_fs_sb.h --- linux-2.6.17-git13/include/linux/ext3_fs_sb.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.048854307 -0700 +++ linux-2.6.17-git13-ming/include/linux/ext3_fs_sb.h 2006-06-28 17:45:21.110847194 -0700 @@ -78,6 +78,16 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif + +#ifdef EXTENTS_STATS + /* ext3 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif }; #endif /* _LINUX_EXT3_FS_SB */ diff -puN include/linux/ext3_jbd.h~ext3dev-2.6.17-git13 include/linux/ext3_jbd.h --- linux-2.6.17-git13/include/linux/ext3_jbd.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.052853848 -0700 +++ linux-2.6.17-git13-ming/include/linux/ext3_jbd.h 2006-06-28 17:45:21.111847079 -0700 @@ -26,9 +26,14 @@ * * We may have to touch one inode, one bitmap buffer, up to three * indirection blocks, the group and superblock summaries, and the data - * block to complete the transaction. */ - -#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify upto + * 5 levels of tree + root which is stored in inode. */ + +#define EXT3_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS) \ + || test_opt(sb, EXTENTS) ? 
27U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -42,7 +47,7 @@ * superblock only gets updated once, of course, so don't bother * counting that again for the quota updates. */ -#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ +#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT3_XATTR_TRANS_BLOCKS - 2 + \ 2*EXT3_QUOTA_TRANS_BLOCKS(sb)) @@ -78,9 +83,9 @@ /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ - (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) + (EXT3_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) #define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ - (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) + (EXT3_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) #else #define EXT3_QUOTA_TRANS_BLOCKS(sb) 0 #define EXT3_QUOTA_INIT_BLOCKS(sb) 0 @@ -149,7 +154,7 @@ __ext3_journal_forget(const char *where, static inline int __ext3_journal_revoke(const char *where, handle_t *handle, - unsigned long blocknr, struct buffer_head *bh) + ext3_fsblk_t blocknr, struct buffer_head *bh) { int err = journal_revoke(handle, blocknr, bh); if (err) diff -puN include/linux/jbd.h~ext3dev-2.6.17-git13 include/linux/jbd.h --- linux-2.6.17-git13/include/linux/jbd.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.055853504 -0700 +++ linux-2.6.17-git13-ming/include/linux/jbd.h 2006-06-28 17:45:21.114846735 -0700 @@ -147,14 +147,21 @@ typedef struct journal_header_s /* - * The block tag: used to describe a single buffer in the journal + * The block tag: used to describe a single buffer in the journal. 
+ * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this + * raw struct shouldn't be used for pointer math or sizeof() - use + * journal_tag_bytes(journal) instead to compute this. */ typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ __be32 t_flags; /* See below */ + __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; +#define JBD_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high)) +#define JBD_TAG_SIZE64 (sizeof(journal_block_tag_t)) + /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log @@ -232,11 +239,13 @@ typedef struct journal_superblock_s ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) #define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 +#define JFS_FEATURE_INCOMPAT_64BIT 0x00000002 /* Features known to this kernel version: */ #define JFS_KNOWN_COMPAT_FEATURES 0 #define JFS_KNOWN_ROCOMPAT_FEATURES 0 -#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE +#define JFS_KNOWN_INCOMPAT_FEATURES (JFS_FEATURE_INCOMPAT_REVOKE | \ + JFS_FEATURE_INCOMPAT_64BIT) #ifdef __KERNEL__ @@ -729,7 +738,7 @@ struct journal_s */ struct block_device *j_dev; int j_blocksize; - unsigned int j_blk_offset; + sector_t j_blk_offset; /* * Device which holds the client fs. 
For internal journal this will be @@ -848,7 +857,7 @@ extern void __journal_clean_data_list(tr /* Log buffer allocation */ extern struct journal_head * journal_get_descriptor_buffer(journal_t *); -int journal_next_log_block(journal_t *, unsigned long *); +int journal_next_log_block(journal_t *, sector_t *); /* Commit management */ extern void journal_commit_transaction(journal_t *); @@ -863,7 +872,7 @@ extern int journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct journal_head **jh_out, - int blocknr); + sector_t blocknr); /* Transaction locking */ extern void __wait_on_journal (journal_t *); @@ -911,7 +920,7 @@ extern void journal_unlock_updates (jou extern journal_t * journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, - int start, int len, int bsize); + sector_t start, int len, int bsize); extern journal_t * journal_init_inode (struct inode *); extern int journal_update_format (journal_t *); extern int journal_check_used_features @@ -932,7 +941,7 @@ extern void journal_abort (journ extern int journal_errno (journal_t *); extern void journal_ack_err (journal_t *); extern int journal_clear_err (journal_t *); -extern int journal_bmap(journal_t *, unsigned long, unsigned long *); +extern int journal_bmap(journal_t *, unsigned long, sector_t *); extern int journal_force_commit(journal_t *); /* @@ -965,14 +974,13 @@ extern void journal_destroy_revoke_ca extern int journal_init_revoke_caches(void); extern void journal_destroy_revoke(journal_t *); -extern int journal_revoke (handle_t *, - unsigned long, struct buffer_head *); +extern int journal_revoke (handle_t *, sector_t, struct buffer_head *); extern int journal_cancel_revoke(handle_t *, struct journal_head *); extern void journal_write_revoke_records(journal_t *, transaction_t *); /* Recovery revoke support */ -extern int journal_set_revoke(journal_t *, unsigned long, tid_t); -extern int journal_test_revoke(journal_t *, unsigned long, tid_t); +extern int 
journal_set_revoke(journal_t *, sector_t, tid_t); +extern int journal_test_revoke(journal_t *, sector_t, tid_t); extern void journal_clear_revoke(journal_t *); extern void journal_brelse_array(struct buffer_head *b[], int n); extern void journal_switch_revoke_table(journal_t *journal); @@ -1050,6 +1058,7 @@ static inline int tid_geq(tid_t x, tid_t } extern int journal_blocks_per_page(struct inode *inode); +extern size_t journal_tag_bytes(journal_t *journal); /* * Return the minimum number of blocks which must be free in the journal diff -puN include/linux/types.h~ext3dev-2.6.17-git13 include/linux/types.h --- linux-2.6.17-git13/include/linux/types.h~ext3dev-2.6.17-git13 2006-06-28 17:45:21.058853160 -0700 +++ linux-2.6.17-git13-ming/include/linux/types.h 2006-06-28 17:45:21.114846735 -0700 @@ -134,6 +134,7 @@ typedef __s64 int64_t; */ #ifndef HAVE_SECTOR_T typedef unsigned long sector_t; +#define SECTOR_FMT "%lu" #endif #ifndef HAVE_BLKCNT_T _